[llvm] [AMDGPU] Lazily emit waitcnts on function entry (PR #73122)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 1 05:33:53 PST 2023


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/73122

>From eb95e00de6aedc8b87b3de95aeb33584ca49bfa1 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 20 Jun 2023 16:50:43 +0100
Subject: [PATCH 1/2] [AMDGPU] Lazily emit waitcnts on function entry

Instead of emitting s_waitcnt 0 on entry to a function, just remember
that the counters are in an unknown state and let the pass emit the
waitcnts naturally before the first use of any register that might need
it.

Delay the s_waitcnts could have a small performance improvement, and in
some cases allows them to be combined with other waits which can have a
small code size improvement.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp   |   66 +-
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |   56 +-
 .../CodeGen/AMDGPU/GlobalISel/add.vni16.ll    |   86 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll   |   84 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  |   30 +-
 .../GlobalISel/artifact-combiner-asserts.ll   |    4 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |  168 +-
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll |    3 +-
 .../br-constant-invalid-sgpr-copy.ll          |   16 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll  |   80 +-
 ...legalization-artifact-combiner-dead-def.ll |    2 +-
 .../GlobalISel/call-outgoing-stack-args.ll    |    8 +-
 .../GlobalISel/clamp-fmed3-const-combine.ll   |   16 +-
 .../GlobalISel/clamp-minmax-const-combine.ll  |   34 +-
 .../GlobalISel/combine-fma-add-fma-mul.ll     |  144 +-
 .../AMDGPU/GlobalISel/combine-fma-add-mul.ll  |  224 +--
 .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll  |  192 +--
 .../GlobalISel/combine-fma-sub-neg-mul.ll     |   72 +-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        |   94 +-
 .../GlobalISel/divergent-control-flow.ll      |   20 +-
 .../GlobalISel/dynamic-alloca-uniform.ll      |   21 +-
 .../GlobalISel/extractelement-stack-lower.ll  |    9 +-
 .../AMDGPU/GlobalISel/extractelement.i128.ll  |   70 +-
 .../AMDGPU/GlobalISel/extractelement.i16.ll   |  150 +-
 .../AMDGPU/GlobalISel/extractelement.i8.ll    |  325 ++--
 .../AMDGPU/GlobalISel/extractelement.ll       |  120 +-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     |  392 ++---
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll     |  458 ++---
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll     |  162 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  106 +-
 .../CodeGen/AMDGPU/GlobalISel/floor.f64.ll    |   32 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll    |  210 +--
 .../CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll  |   58 +-
 .../GlobalISel/fmed3-min-max-const-combine.ll |   72 +-
 .../CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll  |   82 +-
 .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll   |  120 +-
 .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll    |   19 +-
 .../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll  |   86 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll   |  120 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |  266 +--
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |  248 +--
 .../GlobalISel/image-waterfall-loop-O0.ll     |    4 +-
 .../CodeGen/AMDGPU/GlobalISel/inline-asm.ll   |    8 +-
 .../AMDGPU/GlobalISel/insertelement.ll        |   13 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   24 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll  |   45 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll  |   45 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll    |   20 +-
 .../GlobalISel/llvm.amdgcn.fmul.legacy.ll     |  156 +-
 .../llvm.amdgcn.global.atomic.csub.ll         |   24 +-
 .../llvm.amdgcn.global.atomic.fadd.ll         |   30 +-
 .../GlobalISel/llvm.amdgcn.rsq.clamp.ll       |   32 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll |    8 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll     |    2 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll    |   78 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll    |   20 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll    |   16 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll   |   16 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll   |   16 +-
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |    8 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll     |    2 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll    |   78 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll    |   22 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll    |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.powi.ll    |   46 +-
 .../AMDGPU/GlobalISel/load-constant.96.ll     |   73 +-
 .../AMDGPU/GlobalISel/load-local.128.ll       |  117 +-
 .../AMDGPU/GlobalISel/load-local.96.ll        |  108 +-
 .../AMDGPU/GlobalISel/load-unaligned.ll       |   52 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll    |    6 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  172 +-
 .../CodeGen/AMDGPU/GlobalISel/minmaxabs.ll    |   16 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |   64 +-
 .../CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll    |   24 +-
 .../AMDGPU/GlobalISel/non-entry-alloca.ll     |    6 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   |   30 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  174 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll   |   12 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll     |   34 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     |   40 +-
 .../AMDGPU/GlobalISel/select-to-fmin-fmax.ll  |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |  144 +-
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |   58 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  176 +-
 .../CodeGen/AMDGPU/GlobalISel/shlN_add.ll     |   30 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll  |   42 +-
 .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll     |   34 +-
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     |   40 +-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |  174 +-
 .../AMDGPU/GlobalISel/strict_fma.f16.ll       |   24 +-
 .../AMDGPU/GlobalISel/strict_fma.f32.ll       |   16 +-
 .../AMDGPU/GlobalISel/strict_fma.f64.ll       |   16 +-
 .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll    |   56 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll   |   84 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll  |    8 +-
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |  170 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll   |    8 +-
 .../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll     |   30 +-
 .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll     |   36 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll  |   42 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll     |   32 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     |   38 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  170 +-
 .../CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll    |    4 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll   |   28 +-
 .../CodeGen/AMDGPU/GlobalISel/zextload.ll     |  125 +-
 .../abi-attribute-hints-undefined-behavior.ll |   28 +-
 .../test/CodeGen/AMDGPU/addrspacecast.gfx6.ll |   32 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     |   12 +-
 llvm/test/CodeGen/AMDGPU/agpr-remat.ll        |    2 +-
 llvm/test/CodeGen/AMDGPU/amd.endpgm.ll        |    3 -
 ...amdgpu-codegenprepare-fold-binop-select.ll |   34 +-
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll |   44 +-
 .../AMDGPU/amdgpu-cs-chain-preserve-cc.ll     |   36 +-
 .../CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll  |    2 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |   33 +-
 llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll  |   46 +-
 llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll    |    7 +-
 .../back-off-barrier-subtarget-feature.ll     |   19 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  371 ++--
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |   80 +-
 llvm/test/CodeGen/AMDGPU/bfi_nested.ll        |   16 +-
 llvm/test/CodeGen/AMDGPU/bfm.ll               |   12 +-
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        |   10 +-
 .../test/CodeGen/AMDGPU/branch-relax-spill.ll |    4 +-
 llvm/test/CodeGen/AMDGPU/bswap.ll             |   48 +-
 .../build-vector-packed-partial-undef.ll      |   64 +-
 llvm/test/CodeGen/AMDGPU/bypass-div.ll        |   30 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |   50 +-
 llvm/test/CodeGen/AMDGPU/call-waitcnt.ll      |    2 -
 llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir |    1 -
 .../CodeGen/AMDGPU/calling-conventions.ll     |    4 +-
 .../AMDGPU/cgp-addressing-modes-flat.ll       |   48 +-
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  145 +-
 llvm/test/CodeGen/AMDGPU/clamp-modifier.ll    |    8 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |    8 +-
 .../CodeGen/AMDGPU/combine-add-zext-xor.ll    |   24 +-
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |    4 +-
 .../CodeGen/AMDGPU/combine-vload-extract.ll   |   19 +-
 .../CodeGen/AMDGPU/combine_andor_with_cmps.ll |  404 ++---
 .../CodeGen/AMDGPU/computeNumSignBits-mul.ll  |   14 +-
 llvm/test/CodeGen/AMDGPU/constrained-shift.ll |   28 +-
 .../AMDGPU/copy-vgpr-clobber-spill-vgpr.mir   |    2 -
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |    8 +-
 llvm/test/CodeGen/AMDGPU/cse-convergent.ll    |    4 +-
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |  227 ++-
 llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll   |   24 +-
 .../CodeGen/AMDGPU/dagcombine-fma-fmad.ll     |   12 +-
 .../dagcombine-v1i8-extractvecelt-crash.ll    |    4 +-
 .../AMDGPU/divergence-driven-buildvector.ll   |   68 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |    2 +-
 .../expand-scalar-carry-out-select-user.ll    |    8 +-
 llvm/test/CodeGen/AMDGPU/extract-load-i1.ll   |    3 +-
 llvm/test/CodeGen/AMDGPU/extract-lowbits.ll   |   26 +-
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  165 +-
 .../AMDGPU/extract-subvector-equal-length.ll  |    2 +-
 .../fast-unaligned-load-store.global.ll       |   43 +-
 .../fast-unaligned-load-store.private.ll      |  130 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  161 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |   42 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |   48 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll     |   48 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll     |   46 +-
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |   40 +-
 llvm/test/CodeGen/AMDGPU/fdiv.ll              |  628 +++----
 llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll    |  200 +--
 .../fix-frame-reg-in-custom-csr-spills.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |  280 ++--
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  744 +++++----
 .../CodeGen/AMDGPU/flat_atomics_i64_system.ll |  744 +++++----
 llvm/test/CodeGen/AMDGPU/fma.f16.ll           |   52 +-
 ...formation-fmul-distribute-denormal-mode.ll |   72 +-
 llvm/test/CodeGen/AMDGPU/fmax3.ll             |    8 +-
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll   |   80 +-
 .../test/CodeGen/AMDGPU/fmed3-cast-combine.ll |  247 ++-
 llvm/test/CodeGen/AMDGPU/fmin3.ll             |    8 +-
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll   |   80 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     | 1488 ++++++++---------
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |  701 ++++----
 .../CodeGen/AMDGPU/fneg-combines.legal.f16.ll |   18 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |  419 +++--
 .../fneg-fold-legalize-dag-increase-insts.ll  |    4 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |  210 ++-
 llvm/test/CodeGen/AMDGPU/fold-fabs.ll         |   19 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  192 +--
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll |   19 +-
 .../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll     |   92 +-
 llvm/test/CodeGen/AMDGPU/fpext-free.ll        |  114 +-
 llvm/test/CodeGen/AMDGPU/fpow.ll              |  120 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  289 ++--
 ...frame-setup-without-sgpr-to-vgpr-spills.ll |    4 +-
 .../CodeGen/AMDGPU/frexp-constant-fold.ll     |   50 +-
 llvm/test/CodeGen/AMDGPU/fshr.ll              |  104 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll         |  350 ++--
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |   92 +-
 .../AMDGPU/fsub-as-fneg-src-modifier.ll       |  254 +--
 .../CodeGen/AMDGPU/function-args-inreg.ll     |  229 +--
 llvm/test/CodeGen/AMDGPU/function-args.ll     |  558 +++----
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |  339 ++--
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll   |    4 +-
 .../AMDGPU/gfx-callable-argument-types.ll     |  824 ++++-----
 .../gfx-callable-preserved-registers.ll       |   96 +-
 .../AMDGPU/gfx-callable-return-types.ll       |   82 +-
 .../AMDGPU/global_atomics_i32_system.ll       |  766 +++++----
 .../AMDGPU/global_atomics_i64_system.ll       |  766 +++++----
 .../AMDGPU/greedy-reverse-local-assignment.ll |    8 +-
 .../identical-subrange-spill-infloop.ll       |    2 +-
 llvm/test/CodeGen/AMDGPU/imm16.ll             |   90 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |   36 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    4 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |    2 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |   11 +-
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    |  556 +++---
 llvm/test/CodeGen/AMDGPU/known-never-nan.ll   |    4 +-
 llvm/test/CodeGen/AMDGPU/known-never-snan.ll  |   88 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |    9 +-
 .../AMDGPU/lds-global-non-entry-func.ll       |   87 +-
 .../AMDGPU/llvm.amdgcn.bitreplicate.ll        |   12 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll  |   12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll  |   24 +-
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |    3 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll   |   12 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll   |    2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll   |    2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll    |   24 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll   |    4 +-
 .../llvm.amdgcn.set.inactive.chain.arg.ll     |   84 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll    |    6 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.ll  |   12 +-
 .../test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll |   16 +-
 .../test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll |   16 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll   |   24 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          |  448 ++---
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  232 +--
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  220 +--
 llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll   |  120 +-
 llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll |    8 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     |  610 +++----
 llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll   |  200 +--
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  110 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  588 +++----
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  588 +++----
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  380 ++---
 llvm/test/CodeGen/AMDGPU/llvm.mulo.ll         |   32 +-
 llvm/test/CodeGen/AMDGPU/llvm.powi.ll         |   50 +-
 llvm/test/CodeGen/AMDGPU/load-hi16.ll         |  625 +++----
 llvm/test/CodeGen/AMDGPU/load-lo16.ll         |  504 ++----
 llvm/test/CodeGen/AMDGPU/load-local.128.ll    |  150 +-
 llvm/test/CodeGen/AMDGPU/load-local.96.ll     |  140 +-
 .../AMDGPU/load-range-metadata-assert.ll      |    5 +-
 .../AMDGPU/load-range-metadata-sign-bits.ll   |   25 +-
 llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll  |   96 +-
 .../local-stack-alloc-block-sp-reference.ll   |    6 +-
 .../loop-live-out-copy-undef-subrange.ll      |    3 +-
 .../AMDGPU/loop-on-function-argument.ll       |    4 +-
 .../test/CodeGen/AMDGPU/loop_exit_with_xor.ll |   14 +-
 ...p-var-out-of-divergent-loop-swdev407790.ll |    4 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll        |   91 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        |  342 ++--
 llvm/test/CodeGen/AMDGPU/mad-mix.ll           |  470 +++---
 llvm/test/CodeGen/AMDGPU/mad.u16.ll           |   24 +-
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |  136 +-
 llvm/test/CodeGen/AMDGPU/med3-knownbits.ll    |   10 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |    6 +-
 .../test/CodeGen/AMDGPU/memcpy-fixed-align.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |   15 +-
 llvm/test/CodeGen/AMDGPU/minmax.ll            |   40 +-
 .../AMDGPU/module-lds-false-sharing.ll        |   12 +-
 ...uf-legalize-operands-non-ptr-intrinsics.ll |   74 +-
 .../CodeGen/AMDGPU/mubuf-legalize-operands.ll |   74 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               |   10 +-
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |   11 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |   12 +-
 llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll |    8 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll        |   12 +-
 .../AMDGPU/no-source-locations-in-prologue.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/offset-split-flat.ll |  351 ++--
 .../CodeGen/AMDGPU/offset-split-global.ll     |  366 ++--
 .../CodeGen/AMDGPU/partial-shift-shrink.ll    |   24 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        |  319 ++--
 llvm/test/CodeGen/AMDGPU/preserve-hi16.ll     |  202 +--
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll       |    4 +-
 .../CodeGen/AMDGPU/private-memory-atomics.ll  |   69 +-
 .../AMDGPU/prologue-epilogue-markers.ll       |    6 +-
 llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll |    9 +-
 llvm/test/CodeGen/AMDGPU/ptrmask.ll           |    8 +-
 llvm/test/CodeGen/AMDGPU/rcp-pattern.ll       |   78 +-
 .../AMDGPU/reassoc-mul-add-1-to-mad.ll        |  706 ++++----
 llvm/test/CodeGen/AMDGPU/release-vgprs.mir    |  332 ++--
 llvm/test/CodeGen/AMDGPU/repeated-divisor.ll  |   78 +-
 .../AMDGPU/replace-store-of-insert-load.ll    |   12 +-
 llvm/test/CodeGen/AMDGPU/rotl.ll              |   10 +-
 llvm/test/CodeGen/AMDGPU/rotr.ll              |   10 +-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |  272 +--
 llvm/test/CodeGen/AMDGPU/rsq.f32.ll           |  144 +-
 llvm/test/CodeGen/AMDGPU/rsq.f64.ll           |  216 +--
 llvm/test/CodeGen/AMDGPU/saddsat.ll           |   66 +-
 .../CodeGen/AMDGPU/schedule-addrspaces.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |   36 +-
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |   12 +-
 .../CodeGen/AMDGPU/select-constant-xor.ll     |   22 +-
 .../AMDGPU/select-fabs-fneg-extract.f16.ll    |  270 +--
 .../select-fabs-fneg-extract.legal.f16.ll     |    2 +-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  358 ++--
 .../test/CodeGen/AMDGPU/setcc-multiple-use.ll |    2 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll      |   23 +-
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |   24 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |   12 +-
 .../test/CodeGen/AMDGPU/shl_add_ptr_global.ll |    6 +-
 .../si-optimize-vgpr-live-range-dbg-instr.ll  |    2 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |   13 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |    9 +-
 .../AMDGPU/spill-offset-calculation.ll        |   24 +-
 .../spill-vgpr-to-agpr-update-regscavenger.ll |    4 +-
 .../CodeGen/AMDGPU/spill-writelane-vgprs.ll   |    7 +-
 .../AMDGPU/srem-seteq-illegal-types.ll        |    8 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |   36 +-
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           |  100 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |  124 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |   36 +-
 llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll   |   44 +-
 llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll   |   48 +-
 llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll   |   28 +-
 llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll    |   56 +-
 llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll    |   32 +-
 llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll    |   32 +-
 llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll   |   68 +-
 llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll   |   48 +-
 llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll   |   28 +-
 llvm/test/CodeGen/AMDGPU/strict_fpext.ll      |   44 +-
 llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll    |   28 +-
 llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll   |   92 +-
 llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll   |   48 +-
 llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll   |   28 +-
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll  |   48 +-
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll  |   50 +-
 llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll  |    8 +-
 llvm/test/CodeGen/AMDGPU/swdev373493.ll       |    2 +-
 .../CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll    |    4 +-
 llvm/test/CodeGen/AMDGPU/trunc-combine.ll     |   16 +-
 .../AMDGPU/tuple-allocation-failure.ll        |    2 +-
 llvm/test/CodeGen/AMDGPU/uaddsat.ll           |   96 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |    8 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |   36 +-
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |   13 +-
 llvm/test/CodeGen/AMDGPU/uniform-cfg.ll       |    7 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |    7 +-
 .../AMDGPU/urem-seteq-illegal-types.ll        |   10 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |   27 +-
 llvm/test/CodeGen/AMDGPU/usubsat.ll           |  122 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   56 +-
 .../CodeGen/AMDGPU/vector_shuffle.packed.ll   |  469 +++---
 ...r-descriptor-waterfall-loop-idom-update.ll |    6 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |   12 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir     |    6 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir      |   15 +-
 .../AMDGPU/waitcnt-meta-instructions.mir      |    4 -
 llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir |   24 +-
 .../AMDGPU/waitcnt-preexisting-vscnt.mir      |   15 +-
 .../CodeGen/AMDGPU/waitcnt-preexisting.mir    |   12 +-
 llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir |    7 -
 llvm/test/CodeGen/AMDGPU/wave32.ll            |   12 +-
 .../AMDGPU/whole-wave-register-copy.ll        |    2 +-
 .../AMDGPU/whole-wave-register-spill.ll       |    4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |   48 +-
 llvm/test/CodeGen/AMDGPU/wwm-reserved.ll      |   16 +-
 366 files changed, 16891 insertions(+), 18914 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd7d..8fdfa0ac962f708 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -256,6 +256,7 @@ class WaitcntBrackets {
   unsigned hasPendingEvent(WaitEventType E) const {
     return PendingEvents & (1 << E);
   }
+  void setPendingEvent(WaitEventType E) { PendingEvents |= 1 << E; }
   unsigned hasPendingEvent(InstCounterType T) const {
     unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
     assert((HasPending != 0) == (getScoreRange(T) != 0));
@@ -295,6 +296,17 @@ class WaitcntBrackets {
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
+  void setScoreUB(InstCounterType T, unsigned Val) {
+    assert(T < NUM_INST_CNTS);
+    ScoreUBs[T] = Val;
+
+    if (T != EXP_CNT)
+      return;
+
+    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+  }
+
 private:
   struct MergeInfo {
     unsigned OldLB;
@@ -310,17 +322,6 @@ class WaitcntBrackets {
     ScoreLBs[T] = Val;
   }
 
-  void setScoreUB(InstCounterType T, unsigned Val) {
-    assert(T < NUM_INST_CNTS);
-    ScoreUBs[T] = Val;
-
-    if (T != EXP_CNT)
-      return;
-
-    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
-      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
-  }
-
   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
     if (GprNo < NUM_ALL_VGPRS) {
       VgprUB = std::max(VgprUB, GprNo);
@@ -563,7 +564,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
   // PendingEvents and ScoreUB need to be update regardless if this event
   // changes the score of a register or not.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  PendingEvents |= 1 << E;
+  setPendingEvent(E);
   setScoreUB(T, CurrScore);
 
   if (T == EXP_CNT) {
@@ -1849,28 +1850,35 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   BlockInfos.clear();
   bool Modified = false;
 
-  if (!MFI->isEntryFunction()) {
-    // Wait for any outstanding memory operations that the input registers may
-    // depend on. We can't track them and it's better to do the wait after the
-    // costly call sequence.
-
-    // TODO: Could insert earlier and schedule more liberally with operations
-    // that only use caller preserved registers.
-    MachineBasicBlock &EntryBB = MF.front();
-    MachineBasicBlock::iterator I = EntryBB.begin();
-    for (MachineBasicBlock::iterator E = EntryBB.end();
-         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
-      ;
-    BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
-
-    Modified = true;
-  }
-
   // Keep iterating over the blocks in reverse post order, inserting and
   // updating s_waitcnt where needed, until a fix point is reached.
   for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
     BlockInfos.insert({MBB, BlockInfo()});
 
+  if (!MFI->isEntryFunction()) {
+    MachineBasicBlock &EntryBB = MF.front();
+    BlockInfo &EntryBI = BlockInfos.find(&EntryBB)->second;
+    EntryBI.Incoming = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+    WaitcntBrackets &Brackets = *EntryBI.Incoming;
+
+    unsigned UB = 0;
+    for (auto T : inst_counter_types())
+      UB = std::max(UB, Brackets.getWaitCountMax(T));
+
+    for (auto T : inst_counter_types())
+      Brackets.setScoreUB(T, UB);
+
+    if (ST->hasVscnt()) {
+      Brackets.setPendingEvent(VMEM_READ_ACCESS);
+      Brackets.setPendingEvent(VMEM_WRITE_ACCESS);
+      Brackets.setPendingEvent(SCRATCH_WRITE_ACCESS);
+    } else {
+      Brackets.setPendingEvent(VMEM_ACCESS);
+    }
+    for (unsigned I = LDS_ACCESS; I < NUM_WAIT_EVENTS; ++I)
+      Brackets.setPendingEvent(WaitEventType(I));
+  }
+
   std::unique_ptr<WaitcntBrackets> Brackets;
   bool Repeat;
   do {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index e4cababfe1c919a..5b9e37eb85ff8aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -8,29 +8,29 @@
 define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; GFX7-LABEL: v_add_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a, %b
   ret <2 x i16> %add
@@ -39,7 +39,6 @@ define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
 define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 ; GFX7-LABEL: v_add_v2i16_fneg_lhs:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -47,27 +46,28 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
@@ -78,7 +78,6 @@ define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 ; GFX7-LABEL: v_add_v2i16_fneg_rhs:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
@@ -86,27 +85,28 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
@@ -117,7 +117,6 @@ define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX7-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -130,28 +129,29 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %neg.b = fneg <2 x half> %b
@@ -164,31 +164,31 @@ define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
 ; GFX8-NEXT:    v_add_u16_e32 v2, 0xffc0, v0
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0xffc0, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a, <i16 -64, i16 -64>
   ret <2 x i16> %add
@@ -197,31 +197,31 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 4, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX8-NEXT:    v_add_u16_e32 v1, 0xffc0, v0
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_lo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0x4ffc0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a, <i16 -64, i16 4>
   ret <2 x i16> %add
@@ -230,31 +230,31 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
 ; GFX8-NEXT:    v_add_u16_e32 v2, 4, v0
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0xffc00004, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %a, <i16 4, i16 -64>
   ret <2 x i16> %add
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index ab95c226b08b022..0bab10dc4d32311 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -5,7 +5,6 @@
 define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_ushort v8, v[0:1]
@@ -33,12 +32,11 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    flat_store_short v[4:5], v7
 ; GFX8-NEXT:    flat_store_short v[0:1], v8
 ; GFX8-NEXT:    flat_store_short v[2:3], v6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_ushort v8, v[2:3], off
@@ -59,7 +57,7 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX9-NEXT:    global_store_short v[4:5], v0, off
 ; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:2
 ; GFX9-NEXT:    global_store_short v[4:5], v2, off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <3 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <3 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -71,18 +69,18 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <3 x i16> @add_v3i16_arg(<3 x i16> %a, <3 x i16> %b) {
 ; GFX8-LABEL: add_v3i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_add_u16_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v3i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <3 x i16> %a, %b
   ret <3 x i16> %add
@@ -91,7 +89,6 @@ define <3 x i16> @add_v3i16_arg(<3 x i16> %a, <3 x i16> %b) {
 define void @add_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -102,19 +99,18 @@ define void @add_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v6, v8
 ; GFX9-NEXT:    v_pk_add_u16 v1, v7, v9
 ; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <4 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <4 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -126,20 +122,20 @@ define void @add_v4i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <4 x i16> @add_v4i16_arg(<4 x i16> %a, <4 x i16> %b) {
 ; GFX8-LABEL: add_v4i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v2, v1, v3
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v4i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <4 x i16> %a, %b
   ret <4 x i16> %add
@@ -148,7 +144,6 @@ define <4 x i16> @add_v4i16_arg(<4 x i16> %a, <4 x i16> %b) {
 define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v5i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
@@ -198,12 +193,11 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    flat_store_short v[2:3], v13
 ; GFX8-NEXT:    flat_store_short v[6:7], v14
 ; GFX8-NEXT:    flat_store_short v[8:9], v10
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v5i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
@@ -239,7 +233,7 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX9-NEXT:    global_store_short v[4:5], v1, off offset:4
 ; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v1, off offset:6
 ; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <5 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <5 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -251,7 +245,6 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) {
 ; GFX8-LABEL: add_v5i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v4
@@ -259,14 +252,15 @@ define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v5i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <5 x i16> %a, %b
   ret <5 x i16> %add
@@ -275,7 +269,6 @@ define <5 x i16> @add_v5i16_arg(<5 x i16> %a, <5 x i16> %b) {
 define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx3 v[6:8], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx3 v[0:2], v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -289,12 +282,11 @@ define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v7, v2
 ; GFX8-NEXT:    flat_store_dwordx3 v[4:5], v[0:2]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[6:8], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx3 v[9:11], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -302,7 +294,7 @@ define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX9-NEXT:    v_pk_add_u16 v1, v7, v10
 ; GFX9-NEXT:    v_pk_add_u16 v2, v8, v11
 ; GFX9-NEXT:    global_store_dwordx3 v[4:5], v[0:2], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <6 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <6 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -314,7 +306,6 @@ define void @add_v6i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) {
 ; GFX8-LABEL: add_v6i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v3, v1, v4
@@ -324,14 +315,15 @@ define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) {
 ; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v6i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <6 x i16> %a, %b
   ret <6 x i16> %add
@@ -340,7 +332,6 @@ define <6 x i16> @add_v6i16_arg(<6 x i16> %a, <6 x i16> %b) {
 define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: addv_7i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
@@ -412,12 +403,11 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    v_add_u16_e32 v10, v22, v10
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_short v[0:1], v10
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: addv_7i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v6, v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v7, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
@@ -468,7 +458,7 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX9-NEXT:    global_store_short v[4:5], v2, off offset:8
 ; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v2, off offset:10
 ; GFX9-NEXT:    global_store_short v[4:5], v8, off offset:12
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <7 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <7 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -480,7 +470,6 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) {
 ; GFX8-LABEL: add_v7i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v8, v0, v4
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v4, v1, v5
@@ -491,15 +480,16 @@ define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v7i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v5
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v6
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <7 x i16> %a, %b
   ret <7 x i16> %add
@@ -508,7 +498,6 @@ define <7 x i16> @add_v7i16_arg(<7 x i16> %a, <7 x i16> %b) {
 define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v9i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -536,12 +525,11 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX8-NEXT:    v_or_b32_e32 v3, v10, v9
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    flat_store_short v[14:15], v11
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v9i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
@@ -555,7 +543,7 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 ; GFX9-NEXT:    v_pk_add_u16 v6, v14, v15
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <9 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <9 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -567,7 +555,6 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
 define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) {
 ; GFX8-LABEL: add_v9i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v10, v0, v5
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v5, v1, v6
@@ -581,16 +568,17 @@ define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v10, v0
 ; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_add_u16_e32 v4, v4, v9
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v9i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v5
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v6
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v7
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v8
 ; GFX9-NEXT:    v_pk_add_u16 v4, v4, v9
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <9 x i16> %a, %b
   ret <9 x i16> %add
@@ -599,7 +587,6 @@ define <9 x i16> @add_v9i16_arg(<9 x i16> %a, <9 x i16> %b) {
 define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v10i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -629,12 +616,11 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_dword v[0:1], v6
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v10i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    global_load_dword v14, v[0:1], off offset:16
@@ -648,7 +634,7 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX9-NEXT:    v_pk_add_u16 v6, v14, v15
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    global_store_dword v[4:5], v6, off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <10 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <10 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -660,7 +646,6 @@ define void @add_v10i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v11i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 18, v0
@@ -710,12 +695,11 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX8-NEXT:    flat_store_short v[14:15], v13
 ; GFX8-NEXT:    flat_store_short v[16:17], v18
 ; GFX8-NEXT:    flat_store_short v[6:7], v19
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v11i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
 ; GFX9-NEXT:    global_load_ushort v14, v[0:1], off offset:16
 ; GFX9-NEXT:    global_load_ushort v15, v[2:3], off offset:16
@@ -743,7 +727,7 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX9-NEXT:    global_store_short v[4:5], v0, off offset:16
 ; GFX9-NEXT:    global_store_short_d16_hi v[4:5], v0, off offset:18
 ; GFX9-NEXT:    global_store_short v[4:5], v6, off offset:20
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <11 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <11 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -755,7 +739,6 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) {
 ; GFX8-LABEL: add_v11i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v12, v0, v6
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v6, v1, v7
@@ -772,17 +755,18 @@ define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
 ; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
 ; GFX8-NEXT:    v_add_u16_e32 v5, v5, v11
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v11i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v6
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v7
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v8
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v9
 ; GFX9-NEXT:    v_pk_add_u16 v4, v4, v10
 ; GFX9-NEXT:    v_pk_add_u16 v5, v5, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <11 x i16> %a, %b
   ret <11 x i16> %add
@@ -791,7 +775,6 @@ define <11 x i16> @add_v11i16_arg(<11 x i16> %a, <11 x i16> %b) {
 define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
 ; GFX8-LABEL: add_v12i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
 ; GFX8-NEXT:    flat_load_dwordx4 v[10:13], v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
@@ -824,12 +807,11 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX8-NEXT:    v_or_b32_e32 v7, v8, v9
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v12i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:16
@@ -844,7 +826,7 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 ; GFX9-NEXT:    v_pk_add_u16 v7, v15, v17
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a = load <12 x i16>, ptr addrspace(1) %ptra, align 4
   %b = load <12 x i16>, ptr addrspace(1) %ptrb, align 4
@@ -856,7 +838,6 @@ define void @add_v12i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
 define <12 x i16> @add_v12i16_arg(<12 x i16> %a, <12 x i16> %b) {
 ; GFX8-LABEL: add_v12i16_arg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v12, v0, v6
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e32 v6, v1, v7
@@ -875,17 +856,18 @@ define <12 x i16> @add_v12i16_arg(<12 x i16> %a, <12 x i16> %b) {
 ; GFX8-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v12, v0
 ; GFX8-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_v12i16_arg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v6
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v7
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v8
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v9
 ; GFX9-NEXT:    v_pk_add_u16 v4, v4, v10
 ; GFX9-NEXT:    v_pk_add_u16 v5, v5, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %add = add <12 x i16> %a, %b
   ret <12 x i16> %add
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
index a1013f3803e781a..f580b03df145885 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll
@@ -6,26 +6,26 @@
 define i32 @v_uaddo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_uaddo_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddo_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %add = extractvalue {i32, i1} %uaddo, 0
@@ -38,32 +38,32 @@ define i32 @v_uaddo_i32(i32 %a, i32 %b) {
 define i64 @v_uaddo_i64(i64 %a, i64 %b) {
 ; GFX7-LABEL: v_uaddo_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddo_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %add = extractvalue {i64, i1} %uaddo, 0
@@ -76,7 +76,6 @@ define i64 @v_uaddo_i64(i64 %a, i64 %b) {
 define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_uaddo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -84,11 +83,11 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -96,15 +95,16 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
   %add = extractvalue {i8, i1} %uaddo, 0
@@ -117,7 +117,6 @@ define i8 @v_uaddo_i8(i8 %a, i8 %b) {
 define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_uaddo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -125,11 +124,11 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -137,11 +136,11 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -149,6 +148,7 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i7, i1} @llvm.uadd.with.overflow.i7(i7 %a, i7 %b)
   %add = extractvalue {i7, i1} %uaddo, 0
@@ -161,35 +161,35 @@ define i7 @v_uaddo_i7(i7 %a, i7 %b) {
 define <2 x i32> @v_uaddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-LABEL: v_uaddo_v2i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddo_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {<2 x i32>, <2 x i1>} @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
   %add = extractvalue {<2 x i32>, <2 x i1>} %uaddo, 0
@@ -202,35 +202,35 @@ define <2 x i32> @v_uaddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 define i32 @v_saddo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_saddo_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX7-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddo_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX8-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
   %add = extractvalue {i32, i1} %saddo, 0
@@ -243,7 +243,6 @@ define i32 @v_saddo_i32(i32 %a, i32 %b) {
 define i64 @v_saddo_i64(i64 %a, i64 %b) {
 ; GFX7-LABEL: v_saddo_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -252,11 +251,11 @@ define i64 @v_saddo_i64(i64 %a, i64 %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddo_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -265,11 +264,11 @@ define i64 @v_saddo_i64(i64 %a, i64 %b) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -278,6 +277,7 @@ define i64 @v_saddo_i64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
   %add = extractvalue {i64, i1} %saddo, 0
@@ -290,7 +290,6 @@ define i64 @v_saddo_i64(i64 %a, i64 %b) {
 define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-LABEL: v_saddo_v2i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v5, vcc, v1, v3
 ; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -303,11 +302,11 @@ define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddo_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -320,11 +319,11 @@ define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v4, v0, v2
 ; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -337,6 +336,7 @@ define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u32_e32 v0, v4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v5, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
   %add = extractvalue {<2 x i32>, <2 x i1>} %saddo, 0
@@ -349,7 +349,6 @@ define <2 x i32> @v_saddo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 define i8 @v_saddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_saddo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -359,11 +358,11 @@ define i8 @v_saddo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -373,11 +372,11 @@ define i8 @v_saddo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -385,6 +384,7 @@ define i8 @v_saddo_i8(i8 %a, i8 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
   %add = extractvalue {i8, i1} %saddo, 0
@@ -397,7 +397,6 @@ define i8 @v_saddo_i8(i8 %a, i8 %b) {
 define i7 @v_saddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_saddo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -407,11 +406,11 @@ define i7 @v_saddo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -421,11 +420,11 @@ define i7 @v_saddo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -435,6 +434,7 @@ define i7 @v_saddo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i7, i1} @llvm.sadd.with.overflow.i7(i7 %a, i7 %b)
   %add = extractvalue {i7, i1} %saddo, 0
@@ -549,7 +549,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_uaddo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -557,11 +556,11 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_uaddo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -569,15 +568,16 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_uaddo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
   %add = extractvalue {i8, i1} %uaddo, 0
@@ -590,7 +590,6 @@ define i8 @s_uaddo_i8(i8 %a, i8 %b) {
 define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_uaddo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -598,11 +597,11 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_uaddo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
@@ -610,11 +609,11 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_uaddo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
@@ -622,6 +621,7 @@ define i7 @s_uaddo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %uaddo = call {i7, i1} @llvm.uadd.with.overflow.i7(i7 %a, i7 %b)
   %add = extractvalue {i7, i1} %uaddo, 0
@@ -805,7 +805,6 @@ define amdgpu_ps <2 x i32> @s_saddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 define i8 @s_saddo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_saddo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -815,11 +814,11 @@ define i8 @s_saddo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_saddo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -829,11 +828,11 @@ define i8 @s_saddo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_saddo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -841,6 +840,7 @@ define i8 @s_saddo_i8(i8 %a, i8 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
   %add = extractvalue {i8, i1} %saddo, 0
@@ -853,7 +853,6 @@ define i8 @s_saddo_i8(i8 %a, i8 %b) {
 define i7 @s_saddo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_saddo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -863,11 +862,11 @@ define i7 @s_saddo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_saddo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -877,11 +876,11 @@ define i7 @s_saddo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_saddo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -891,6 +890,7 @@ define i7 @s_saddo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_add_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %saddo = call {i7, i1} @llvm.sadd.with.overflow.i7(i7 %a, i7 %b)
   %add = extractvalue {i7, i1} %saddo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972e70..c93483b59371837 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -98,16 +98,16 @@ define amdgpu_ps { i32, i32 } @s_andn2_i32_multi_foldable_use(i32 inreg %src0, i
 define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
 ; GCN-LABEL: v_andn2_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i32 %src1, -1
   %and = and i32 %src0, %not.src1
@@ -246,20 +246,20 @@ define amdgpu_ps { i64, i64 } @s_andn2_i64_multi_use(i64 inreg %src0, i64 inreg
 define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
 ; GCN-LABEL: v_andn2_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-NEXT:    v_not_b32_e32 v3, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v2, v2
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v3, v3
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i64 %src1, -1
   %and = and i64 %src0, %not.src1
@@ -442,16 +442,16 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i
 define i16 @v_andn2_i16(i16 %src0, i16 %src1) {
 ; GCN-LABEL: v_andn2_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i16 %src1, -1
   %and = and i16 %src0, %not.src1
@@ -654,7 +654,6 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg
 define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -664,20 +663,21 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_andn2_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
   %and = and <2 x i16> %src0, %not.src1
@@ -865,7 +865,6 @@ define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -881,24 +880,25 @@ define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, v1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_andn2_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -11, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_v3i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -11, v3
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
   %and = and <3 x i16> %src0, %not.src1
@@ -1098,7 +1098,6 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg
 define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1117,24 +1116,25 @@ define <4 x i16> @v_andn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, v1, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_andn2_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_andn2_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
   %and = and <4 x i16> %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
index 6dce6c1852af9b1..3559c910a7980cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll
@@ -4,7 +4,6 @@
 define hidden <2 x i64> @icmp_v2i32_sext_to_v2i64(<2 x i32> %arg) {
 ; CHECK-LABEL: icmp_v2i32_sext_to_v2i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -13,6 +12,7 @@ define hidden <2 x i64> @icmp_v2i32_sext_to_v2i64(<2 x i32> %arg) {
 ; CHECK-NEXT:    v_bfe_i32 v2, v1, 0, 1
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %arg, zeroinitializer
   %sext = sext <2 x i1> %cmp to <2 x i64>
@@ -22,7 +22,6 @@ define hidden <2 x i64> @icmp_v2i32_sext_to_v2i64(<2 x i32> %arg) {
 define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) {
 ; CHECK-LABEL: icmp_v2i32_zext_to_v2i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -31,6 +30,7 @@ define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; CHECK-NEXT:    v_and_b32_e32 v2, 1, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %arg, zeroinitializer
   %sext = zext <2 x i1> %cmp to <2 x i64>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ec832ed0f7f3aa6..45f4f1e98594533 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -8,31 +8,31 @@
 define i8 @v_ashr_i8(i8 %value, i8 %amount) {
 ; GFX6-LABEL: v_ashr_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i8 %value, %amount
   ret i8 %result
@@ -41,30 +41,30 @@ define i8 @v_ashr_i8(i8 %value, i8 %amount) {
 define i8 @v_ashr_i8_7(i8 %value) {
 ; GFX6-LABEL: v_ashr_i8_7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i8_7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8_7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 7
 ; GFX9-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 7, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i8 %value, 7
   ret i8 %result
@@ -121,18 +121,18 @@ define amdgpu_ps i8 @s_ashr_i8_7(i8 inreg %value) {
 define i24 @v_ashr_i24(i24 %value, i24 %amount) {
 ; GCN-LABEL: v_ashr_i24:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i24 %value, %amount
   ret i24 %result
@@ -141,16 +141,16 @@ define i24 @v_ashr_i24(i24 %value, i24 %amount) {
 define i24 @v_ashr_i24_7(i24 %value) {
 ; GCN-LABEL: v_ashr_i24_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i24_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i24 %value, 7
   ret i24 %result
@@ -191,14 +191,14 @@ define amdgpu_ps i24 @s_ashr_i24_7(i24 inreg %value) {
 define i32 @v_ashr_i32(i32 %value, i32 %amount) {
 ; GCN-LABEL: v_ashr_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i32 %value, %amount
   ret i32 %result
@@ -207,14 +207,14 @@ define i32 @v_ashr_i32(i32 %value, i32 %amount) {
 define i32 @v_ashr_i32_31(i32 %value) {
 ; GCN-LABEL: v_ashr_i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i32 %value, 31
   ret i32 %result
@@ -291,16 +291,16 @@ define amdgpu_ps float @ashr_i32_vs(i32 %value, i32 inreg %amount) {
 define <2 x i32> @v_ashr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 ; GCN-LABEL: v_ashr_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i32> %value, %amount
   ret <2 x i32> %result
@@ -309,16 +309,16 @@ define <2 x i32> @v_ashr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 define <2 x i32> @v_ashr_v2i32_31(<2 x i32> %value) {
 ; GCN-LABEL: v_ashr_v2i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i32> %value, <i32 31, i32 31>
   ret <2 x i32> %result
@@ -343,18 +343,18 @@ define amdgpu_ps <2 x i32> @s_ashr_v2i32(<2 x i32> inreg %value, <2 x i32> inreg
 define <3 x i32> @v_ashr_v3i32(<3 x i32> %value, <3 x i32> %amount) {
 ; GCN-LABEL: v_ashr_v3i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, v4, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, v5, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, v4, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, v5, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <3 x i32> %value, %amount
   ret <3 x i32> %result
@@ -381,20 +381,20 @@ define amdgpu_ps <3 x i32> @s_ashr_v3i32(<3 x i32> inreg %value, <3 x i32> inreg
 define <4 x i32> @v_ashr_v4i32(<4 x i32> %value, <4 x i32> %amount) {
 ; GCN-LABEL: v_ashr_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <4 x i32> %value, %amount
   ret <4 x i32> %result
@@ -423,22 +423,22 @@ define amdgpu_ps <4 x i32> @s_ashr_v4i32(<4 x i32> inreg %value, <4 x i32> inreg
 define <5 x i32> @v_ashr_v5i32(<5 x i32> %value, <5 x i32> %amount) {
 ; GCN-LABEL: v_ashr_v5i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v5, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, v6, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, v7, v2
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, v8, v3
 ; GCN-NEXT:    v_ashrrev_i32_e32 v4, v9, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, v5, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, v6, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, v7, v2
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v3, v8, v3
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v4, v9, v4
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <5 x i32> %value, %amount
   ret <5 x i32> %result
@@ -469,7 +469,6 @@ define amdgpu_ps <5 x i32> @s_ashr_v5i32(<5 x i32> inreg %value, <5 x i32> inreg
 define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-LABEL: v_ashr_v16i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, v16, v0
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, v17, v1
@@ -488,11 +487,11 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v14, v30, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v15, v16, v15
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ashr_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v0, v16, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, v17, v1
@@ -511,11 +510,11 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v14, v30, v14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v15, v31, v15
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ashr_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v0, v16, v0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v1, v17, v1
@@ -534,6 +533,7 @@ define <16 x i32> @v_ashr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v14, v30, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v15, v31, v15
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <16 x i32> %value, %amount
   ret <16 x i32> %result
@@ -586,28 +586,28 @@ define amdgpu_ps <16 x i32> @s_ashr_v16i32(<16 x i32> inreg %value, <16 x i32> i
 define i16 @v_ashr_i16(i16 %value, i16 %amount) {
 ; GFX6-LABEL: v_ashr_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i16 %value, %amount
   ret i16 %result
@@ -616,27 +616,27 @@ define i16 @v_ashr_i16(i16 %value, i16 %amount) {
 define i16 @v_ashr_i16_15(i16 %value) {
 ; GFX6-LABEL: v_ashr_i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 15, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 15, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i16 %value, 15
   ret i16 %result
@@ -746,33 +746,33 @@ define amdgpu_ps half @ashr_i16_vs(i16 %value, i16 inreg %amount) {
 define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, v1, v0
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i16> %value, %amount
   ret <2 x i16> %result
@@ -781,32 +781,32 @@ define <2 x i16> @v_ashr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_ashr_v2i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 15, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v0
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v2i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result
@@ -954,7 +954,6 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
@@ -975,31 +974,32 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, v2, v0
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, v3, v1
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <4 x i16> %value, %amount
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -1115,7 +1115,6 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_ashr_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, v8, v0
@@ -1156,11 +1155,11 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v8, v4, v0
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, v5, v1
@@ -1173,24 +1172,25 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v3, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, v5, v1
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v2, v6, v2
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v3, v7, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <8 x i16> %value, %amount
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -1345,26 +1345,26 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 define i64 @v_ashr_i64(i64 %value, i64 %amount) {
 ; GFX6-LABEL: v_ashr_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i64 %value, %amount
   ret i64 %result
@@ -1373,16 +1373,16 @@ define i64 @v_ashr_i64(i64 %value, i64 %amount) {
 define i64 @v_ashr_i64_63(i64 %value) {
 ; GCN-LABEL: v_ashr_i64_63:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i64_63:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i64 %value, 63
   ret i64 %result
@@ -1391,17 +1391,17 @@ define i64 @v_ashr_i64_63(i64 %value) {
 define i64 @v_ashr_i64_33(i64 %value) {
 ; GCN-LABEL: v_ashr_i64_33:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 1, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i64_33:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 1, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i64 %value, 33
   ret i64 %result
@@ -1410,16 +1410,16 @@ define i64 @v_ashr_i64_33(i64 %value) {
 define i64 @v_ashr_i64_32(i64 %value) {
 ; GCN-LABEL: v_ashr_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i64_32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i64 %value, 32
   ret i64 %result
@@ -1428,26 +1428,26 @@ define i64 @v_ashr_i64_32(i64 %value) {
 define i64 @v_ashr_i64_31(i64 %value) {
 ; GFX6-LABEL: v_ashr_i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i64 %value, 31
   ret i64 %result
@@ -1583,30 +1583,30 @@ define amdgpu_ps <2 x float> @ashr_i64_vs(i64 %value, i64 inreg %amount) {
 define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX6-LABEL: v_ashr_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
 ; GFX6-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
 ; GFX8-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i64> %value, %amount
   ret <2 x i64> %result
@@ -1615,30 +1615,30 @@ define <2 x i64> @v_ashr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 define <2 x i64> @v_ashr_v2i64_31(<2 x i64> %value) {
 ; GFX6-LABEL: v_ashr_v2i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
 ; GFX6-NEXT:    v_ashr_i64 v[2:3], v[2:3], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_v2i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
 ; GFX8-NEXT:    v_ashrrev_i64 v[2:3], 31, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_v2i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i64 v[2:3], 31, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_v2i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[2:3], 31, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr <2 x i64> %value, <i64 31, i64 31>
   ret <2 x i64> %result
@@ -1663,7 +1663,6 @@ define amdgpu_ps <2 x i64> @s_ashr_v2i64(<2 x i64> inreg %value, <2 x i64> inreg
 define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX6-LABEL: v_ashr_i65:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v3
@@ -1682,11 +1681,11 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v10, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i65:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v3
@@ -1705,11 +1704,11 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v10, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i65:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v3
@@ -1728,11 +1727,11 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v10, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ashr_i65:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
@@ -1751,11 +1750,11 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ashr_i65:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v4, v2, 0, 1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
@@ -1774,6 +1773,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i65 %value, %amount
   ret i65 %result
@@ -1782,7 +1782,6 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) {
 define i65 @v_ashr_i65_33(i65 %value) {
 ; GFX6-LABEL: v_ashr_i65_33:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1790,11 +1789,11 @@ define i65 @v_ashr_i65_33(i65 %value) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ashr_i65_33:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1802,11 +1801,11 @@ define i65 @v_ashr_i65_33(i65 %value) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i65_33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1814,11 +1813,11 @@ define i65 @v_ashr_i65_33(i65 %value) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ashr_i65_33:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
@@ -1826,6 +1825,7 @@ define i65 @v_ashr_i65_33(i65 %value) {
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 31, v[1:2]
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = ashr i65 %value, 33
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index 604caf572b0fe8e..4a2cf35b384dc00 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -6,9 +6,9 @@ declare hidden ptr addrspace(1) @ext(ptr addrspace(1))
 define ptr addrspace(1) @call_assert_align() {
 ; CHECK-LABEL: call_assert_align:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
@@ -44,7 +44,6 @@ entry:
 define ptr addrspace(1) @tail_call_assert_align() {
 ; CHECK-LABEL: tail_call_assert_align:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
index 439ffbac960b8cf..c6950468aa375e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/br-constant-invalid-sgpr-copy.ll
@@ -8,24 +8,24 @@
 define void @br_false() {
 ; WAVE64-LABEL: br_false:
 ; WAVE64:       ; %bb.0: ; %.exit
-; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:  .LBB0_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    s_mov_b32 s4, 1
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB0_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-LABEL: br_false:
 ; WAVE32:       ; %bb.0: ; %.exit
-; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:  .LBB0_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    s_mov_b32 s4, 1
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB0_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:    s_setpc_b64 s[30:31]
 .exit:
   br label %bb0
@@ -40,24 +40,24 @@ bb0:
 define void @br_true() {
 ; WAVE64-LABEL: br_true:
 ; WAVE64:       ; %bb.0: ; %.exit
-; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:  .LBB1_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    s_mov_b32 s4, 0
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB1_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-LABEL: br_true:
 ; WAVE32:       ; %bb.0: ; %.exit
-; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:  .LBB1_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    s_mov_b32 s4, 0
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB1_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:    s_setpc_b64 s[30:31]
 .exit:
   br label %bb0
@@ -72,7 +72,6 @@ bb0:
 define void @br_undef() {
 ; WAVE64-LABEL: br_undef:
 ; WAVE64:       ; %bb.0: ; %.exit
-; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:  .LBB2_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    ; implicit-def: $sgpr4
@@ -80,11 +79,11 @@ define void @br_undef() {
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB2_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-LABEL: br_undef:
 ; WAVE32:       ; %bb.0: ; %.exit
-; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:  .LBB2_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    ; implicit-def: $sgpr4
@@ -92,6 +91,7 @@ define void @br_undef() {
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB2_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:    s_setpc_b64 s[30:31]
 .exit:
   br label %bb0
@@ -106,7 +106,6 @@ bb0:
 define void @br_poison() {
 ; WAVE64-LABEL: br_poison:
 ; WAVE64:       ; %bb.0: ; %.exit
-; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:  .LBB3_1: ; %bb0
 ; WAVE64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE64-NEXT:    ; implicit-def: $sgpr4
@@ -114,11 +113,11 @@ define void @br_poison() {
 ; WAVE64-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE64-NEXT:    s_cbranch_scc1 .LBB3_1
 ; WAVE64-NEXT:  ; %bb.2: ; %.exit5
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-LABEL: br_poison:
 ; WAVE32:       ; %bb.0: ; %.exit
-; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:  .LBB3_1: ; %bb0
 ; WAVE32-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; WAVE32-NEXT:    ; implicit-def: $sgpr4
@@ -126,6 +125,7 @@ define void @br_poison() {
 ; WAVE32-NEXT:    s_cmp_lg_u32 s4, 0
 ; WAVE32-NEXT:    s_cbranch_scc1 .LBB3_1
 ; WAVE32-NEXT:  ; %bb.2: ; %.exit5
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-NEXT:    s_setpc_b64 s[30:31]
 .exit:
   br label %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
index 132dc876b3b0545..99add13bd26b5e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -43,31 +43,31 @@ define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
 define i32 @v_bswap_i32(i32 %src) {
 ; GFX7-LABEL: v_bswap_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_alignbit_b32 v1, v0, v0, 8
 ; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
 ; GFX7-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i32 @llvm.bswap.i32(i32 %src)
   ret i32 %bswap
@@ -123,7 +123,6 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
 define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
 ; GFX7-LABEL: v_bswap_v2i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_alignbit_b32 v2, v0, v0, 8
 ; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -131,29 +130,30 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
 ; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; GFX7-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
   ret <2 x i32> %bswap
@@ -209,7 +209,6 @@ define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
 define i64 @v_bswap_i64(i64 %src) {
 ; GFX7-LABEL: v_bswap_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -218,32 +217,33 @@ define i64 @v_bswap_i64(i64 %src) {
 ; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; GFX7-NEXT:    v_bfi_b32 v1, s4, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX8-NEXT:    v_perm_b32 v2, 0, v1, s4
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX9-NEXT:    v_perm_b32 v2, 0, v1, s4
 ; GFX9-NEXT:    v_perm_b32 v1, 0, v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v2, 0, v1, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i64 @llvm.bswap.i64(i64 %src)
   ret i64 %bswap
@@ -323,7 +323,6 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
 define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
 ; GFX7-LABEL: v_bswap_v2i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_alignbit_b32 v4, v1, v1, 8
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -339,11 +338,11 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
 ; GFX7-NEXT:    v_bfi_b32 v3, s4, v2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX8-NEXT:    v_perm_b32 v4, 0, v1, s4
 ; GFX8-NEXT:    v_perm_b32 v5, 0, v3, s4
@@ -351,11 +350,11 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
 ; GFX8-NEXT:    v_perm_b32 v3, 0, v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX9-NEXT:    v_perm_b32 v4, 0, v1, s4
 ; GFX9-NEXT:    v_perm_b32 v5, 0, v3, s4
@@ -363,17 +362,18 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
 ; GFX9-NEXT:    v_perm_b32 v3, 0, v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v4, 0, v1, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v5, 0, v3, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v1, 0, v0, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v3, 0, v2, 0x10203
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
   ret <2 x i64> %bswap
@@ -415,30 +415,30 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
 define i16 @v_bswap_i16(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   ret i16 %bswap
@@ -488,31 +488,31 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
 define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   %zext = zext i16 %bswap to i32
@@ -522,34 +522,34 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 ; GFX7-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
 ; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   %zext = sext i16 %bswap to i32
@@ -559,33 +559,33 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; GFX7-LABEL: v_bswap_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
 ; GFX7-NEXT:    v_bfe_u32 v1, v1, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
   ret <2 x i16> %bswap
@@ -594,7 +594,6 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 ; GFX7-LABEL: v_bswap_v3i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v0
 ; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -604,29 +603,30 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
 ; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x2030001
 ; GFX8-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2030001
 ; GFX9-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
 ; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
   ret <3 x i16> %bswap
@@ -635,7 +635,6 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 define i64 @v_bswap_i48(i64 %src) {
 ; GFX7-LABEL: v_bswap_i48:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_alignbit_b32 v2, v1, v1, 8
 ; GFX7-NEXT:    v_alignbit_b32 v1, v1, v1, 24
 ; GFX7-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -644,32 +643,33 @@ define i64 @v_bswap_i48(i64 %src) {
 ; GFX7-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; GFX7-NEXT:    v_bfi_b32 v2, s4, v0, v2
 ; GFX7-NEXT:    v_lshr_b64 v[0:1], v[1:2], 16
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bswap_i48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX8-NEXT:    v_perm_b32 v1, 0, v1, s4
 ; GFX8-NEXT:    v_perm_b32 v2, 0, v0, s4
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_bswap_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10203
 ; GFX9-NEXT:    v_perm_b32 v1, 0, v1, s4
 ; GFX9-NEXT:    v_perm_b32 v2, 0, v0, s4
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bswap_i48:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v1, 0, v1, 0x10203
 ; GFX10-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i64 %src to i48
   %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
index 39423ea22cbb4d7..a877f5f3dd4f81e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bug-legalization-artifact-combiner-dead-def.ll
@@ -4,11 +4,11 @@
 define void @value_finder_bug(ptr addrspace(5) %store_ptr, ptr addrspace(4) %ptr) {
 ; GFX10-LABEL: value_finder_bug:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <4 x float>, ptr addrspace(4) %ptr, align 4
   %vec.3 = extractelement <4 x float> %vec, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index e597ce6f114a6b3..67a48a7351316ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -223,9 +223,9 @@ define amdgpu_kernel void @kernel_caller_byval() {
 define void @func_caller_stack() {
 ; MUBUF-LABEL: func_caller_stack:
 ; MUBUF:       ; %bb.0:
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_mov_b32 s4, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_waitcnt expcnt(0)
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
@@ -258,9 +258,9 @@ define void @func_caller_stack() {
 ;
 ; FLATSCR-LABEL: func_caller_stack:
 ; FLATSCR:       ; %bb.0:
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s0, s33
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_waitcnt expcnt(0)
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
@@ -301,9 +301,9 @@ define void @func_caller_stack() {
 define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; MUBUF-LABEL: func_caller_byval:
 ; MUBUF:       ; %bb.0:
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_mov_b32 s4, s33
 ; MUBUF-NEXT:    s_mov_b32 s33, s32
+; MUBUF-NEXT:    s_waitcnt expcnt(0)
 ; MUBUF-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; MUBUF-NEXT:    s_mov_b64 exec, s[6:7]
@@ -383,9 +383,9 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ;
 ; FLATSCR-LABEL: func_caller_byval:
 ; FLATSCR:       ; %bb.0:
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s0, s33
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
+; FLATSCR-NEXT:    s_waitcnt expcnt(0)
 ; FLATSCR-NEXT:    s_or_saveexec_b64 s[2:3], -1
 ; FLATSCR-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index ea46f4d2d419e8f..ed0e44fcce471c5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -4,8 +4,8 @@
 define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
 ; GFX10-LABEL: test_fmed3_f32_known_nnan_ieee_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call nnan float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -15,8 +15,8 @@ define float @test_fmed3_f32_known_nnan_ieee_true(float %a) #0 {
 define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 {
 ; GFX10-LABEL: test_fmed3_f16_known_nnan_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
   %fmed = call nnan half @llvm.amdgcn.fmed3.f16(half %fmul, half 0.0, half 1.0)
@@ -27,9 +27,9 @@ define half @test_fmed3_f16_known_nnan_ieee_false(half %a) #1 {
 define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
 ; GFX10-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f32_e64 v0, 0x41200000, v0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmin = call float @llvm.minnum.f32(float %a, float 10.0)
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -40,8 +40,8 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
 define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true(float %a) #2 {
 ; GFX10-LABEL: test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -53,8 +53,8 @@ define float @test_fmed3_maybe_SNaN_input_zero_third_operand_ieee_true_dx10clamp
 define float @test_fmed3_global_nnan(float %a) #3 {
 ; GFX10-LABEL: test_fmed3_global_nnan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
@@ -69,9 +69,9 @@ define float @test_fmed3_global_nnan(float %a) #3 {
 define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
 ; GFX10-LABEL: test_fmed3_f32_maybe_NaN_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 1.0, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 1.0, float 0.0)
@@ -82,10 +82,10 @@ define float @test_fmed3_f32_maybe_NaN_ieee_false(float %a) #1 {
 define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 {
 ; GFX10-LABEL: test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 0x41200000, v0
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 0, 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmin = call float @llvm.minnum.f32(float %a, float 10.0)
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmin, float 0.0, float 1.0)
@@ -96,8 +96,8 @@ define float @test_fmed3_non_SNaN_input_ieee_true_dx10clamp_false(float %a) #4 {
 define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2 {
 ; GFX10-LABEL: test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 4f75d205cda35c3..036dc3ec68a4552 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -4,8 +4,8 @@
 define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 ; GFX10-LABEL: test_min_max_ValK0_K1_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %maxnum = call nnan float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -16,8 +16,8 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 define double @test_min_max_K0Val_K1_f64(double %a) #1 {
 ; GFX10-LABEL: test_min_max_K0Val_K1_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul double %a, 2.0
   %maxnum = call nnan double @llvm.maxnum.f64(double 0.0, double %fmul)
@@ -29,8 +29,8 @@ define double @test_min_max_K0Val_K1_f64(double %a) #1 {
 define half @test_min_K1max_ValK0_f16(half %a) #2 {
 ; GFX10-LABEL: test_min_K1max_ValK0_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
   %maxnum = call half @llvm.maxnum.f16(half %fmul, half 0.0)
@@ -41,8 +41,8 @@ define half @test_min_K1max_ValK0_f16(half %a) #2 {
 define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 {
 ; GFX10-LABEL: test_min_K1max_K0Val_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
   %maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half 0.0>, <2 x half> %fmul)
@@ -53,8 +53,8 @@ define <2 x half> @test_min_K1max_K0Val_f16(<2 x half> %a) #1 {
 define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
 ; GFX10-LABEL: test_min_max_splat_padded_with_undef:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
   %maxnum = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 0.0, half undef>, <2 x half> %fmul)
@@ -67,8 +67,8 @@ define <2 x half> @test_min_max_splat_padded_with_undef(<2 x half> %a) #2 {
 define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 ; GFX10-LABEL: test_max_min_ValK1_K0_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %minnum = call nnan float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -79,8 +79,8 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 define double @test_max_min_K1Val_K0_f64(double %a) #1 {
 ; GFX10-LABEL: test_max_min_K1Val_K0_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul double %a, 2.0
   %minnum = call nnan double @llvm.minnum.f64(double 1.0, double %fmul)
@@ -91,8 +91,8 @@ define double @test_max_min_K1Val_K0_f64(double %a) #1 {
 define half @test_max_K0min_ValK1_f16(half %a) #0 {
 ; GFX10-LABEL: test_max_K0min_ValK1_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e64 v0, v0, 2.0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul half %a, 2.0
   %minnum = call nnan half @llvm.minnum.f16(half %fmul, half 1.0)
@@ -104,8 +104,8 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
 define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
 ; GFX10-LABEL: test_max_K0min_K1Val_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, 2.0 op_sel_hi:[1,0] clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul <2 x half> %a, <half 2.0, half 2.0>
   %minnum = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> <half 1.0, half undef>, <2 x half> %fmul)
@@ -118,8 +118,8 @@ define <2 x half> @test_max_K0min_K1Val_v2f16(<2 x half> %a) #1 {
 define float @test_min_max_global_nnan(float %a) #3 {
 ; GFX10-LABEL: test_min_max_global_nnan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 0.0)
   %fmed = call float @llvm.minnum.f32(float %maxnum, float 1.0)
@@ -129,8 +129,8 @@ define float @test_min_max_global_nnan(float %a) #3 {
 define float @test_max_min_global_nnan(float %a) #3 {
 ; GFX10-LABEL: test_max_min_global_nnan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e64 v0, v0, v0 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 1.0)
   %fmed = call float @llvm.maxnum.f32(float %minnum, float 0.0)
@@ -145,9 +145,9 @@ define float @test_max_min_global_nnan(float %a) #3 {
 define float @test_min_max_K0_gt_K1(float %a) #0 {
 ; GFX10-LABEL: test_min_max_K0_gt_K1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 1.0)
   %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 0.0)
@@ -158,9 +158,9 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
 define float @test_max_min_K0_gt_K1(float %a) #0 {
 ; GFX10-LABEL: test_max_min_K0_gt_K1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 1.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 0.0)
   %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 1.0)
@@ -173,10 +173,10 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
 define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 ; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 1.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -188,9 +188,9 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #4 {
 ; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 0, 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %maxnum = call float @llvm.maxnum.f32(float %fmul, float 0.0)
@@ -203,10 +203,10 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
 define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
@@ -217,10 +217,10 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
 ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %minnum = call float @llvm.minnum.f32(float %fmul, float 1.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
index 3c9fae3efc30b1b..7fc2bfa0684be60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-fma-mul.ll
@@ -12,51 +12,51 @@
 define float @test_f32_add_mul(float %a, float %b, float %c, float %d, float %e) {
 ; GFX9-CONTRACT-LABEL: test_f32_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v3, v4
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f32_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f32_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast float %c, %d
@@ -68,51 +68,51 @@ define float @test_f32_add_mul(float %a, float %b, float %c, float %d, float %e)
 define float @test_f32_add_mul_rhs(float %a, float %b, float %c, float %d, float %e) {
 ; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v3, v4
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX10-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f32_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f32_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast float %c, %d
@@ -124,53 +124,53 @@ define float @test_f32_add_mul_rhs(float %a, float %b, float %c, float %d, float
 define half @test_half_add_mul(half %a, half %b, half %c, half %d, half %e) {
 ; GFX9-CONTRACT-LABEL: test_half_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v2, v2, v3, v4
 ; GFX9-DENORM-NEXT:    v_mac_f16_e32 v2, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX10-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_half_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_half_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast half %c, %d
@@ -182,53 +182,53 @@ define half @test_half_add_mul(half %a, half %b, half %c, half %d, half %e) {
 define half @test_half_add_mul_rhs(half %a, half %b, half %c, half %d, half %e) {
 ; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v2, v2, v3, v4
 ; GFX9-DENORM-NEXT:    v_mac_f16_e32 v2, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX10-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v4, v0
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_half_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f16 v2, v2, v3, v4
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-CONTRACT-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_half_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_add_f16_e32 v0, v4, v0
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast half %c, %d
@@ -240,46 +240,46 @@ define half @test_half_add_mul_rhs(half %a, half %b, half %c, half %d, half %e)
 define double @test_double_add_mul(double %a, double %b, double %c, double %d, double %e) {
 ; GFX9-CONTRACT-LABEL: test_double_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_double_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_double_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast double %c, %d
@@ -291,46 +291,46 @@ define double @test_double_add_mul(double %a, double %b, double %c, double %d, d
 define double @test_double_add_mul_rhs(double %a, double %b, double %c, double %d, double %e) {
 ; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_double_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_double_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast double %c, %d
@@ -342,7 +342,6 @@ define double @test_double_add_mul_rhs(double %a, double %b, double %c, double %
 define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e) {
 ; GFX9-CONTRACT-LABEL: test_v4f32_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -351,11 +350,11 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f32_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v8, v8, v12, v16
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v8, v0, v4
 ; GFX9-DENORM-NEXT:    v_mad_f32 v4, v9, v13, v17
@@ -368,11 +367,11 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f32_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -385,11 +384,11 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f32_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX10-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX10-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -402,11 +401,11 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f32_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -417,11 +416,11 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f32_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX11-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX11-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -432,6 +431,7 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GFX11-DENORM-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x float> %c, %d
@@ -443,7 +443,6 @@ define <4 x float> @test_v4f32_add_mul(<4 x float> %a, <4 x float> %b, <4 x floa
 define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e) {
 ; GFX9-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -452,11 +451,11 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f32_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v8, v8, v12, v16
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v8, v0, v4
 ; GFX9-DENORM-NEXT:    v_mad_f32 v4, v9, v13, v17
@@ -469,11 +468,11 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v2, v5
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v3, v6
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -486,11 +485,11 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f32_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX10-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX10-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -503,11 +502,11 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v2, v10
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v3, v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f32_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -518,11 +517,11 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GFX11-CONTRACT-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f32_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_fma_f32 v8, v8, v12, v16
 ; GFX11-DENORM-NEXT:    v_fma_f32 v9, v9, v13, v17
 ; GFX11-DENORM-NEXT:    v_fma_f32 v10, v10, v14, v18
@@ -533,6 +532,7 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GFX11-DENORM-NEXT:    v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x float> %c, %d
@@ -544,16 +544,15 @@ define <4 x float> @test_v4f32_add_mul_rhs(<4 x float> %a, <4 x float> %b, <4 x
 define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) {
 ; GFX9-CONTRACT-LABEL: test_f16_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f16_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -562,20 +561,20 @@ define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c,
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f16_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f16_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -584,21 +583,21 @@ define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c,
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f16_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f16_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -609,6 +608,7 @@ define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c,
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v0, v8
 ; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v1, v9
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x half> %c, %d
@@ -620,16 +620,15 @@ define <4 x half> @test_f16_add_mul(<4 x half> %a, <4 x half> %b, <4 x half> %c,
 define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) {
 ; GFX9-CONTRACT-LABEL: test_f16_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f16_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -638,20 +637,20 @@ define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half>
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f16_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f16_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -660,21 +659,21 @@ define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half>
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f16_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v4, v4, v6, v8
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v5, v5, v7, v9
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f16_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v4, v4, v6
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v5, v5, v7
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
@@ -685,6 +684,7 @@ define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half>
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_pk_add_f16 v0, v8, v0
 ; GFX11-DENORM-NEXT:    v_pk_add_f16 v1, v9, v1
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x half> %c, %d
@@ -696,7 +696,6 @@ define <4 x half> @test_f16_add_mul_rhs(<4 x half> %a, <4 x half> %b, <4 x half>
 define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e) {
 ; GFX9-CONTRACT-LABEL: test_f64_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
@@ -718,11 +717,11 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f64_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
@@ -744,11 +743,11 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f64_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_clause 0x8
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
@@ -771,11 +770,11 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f64_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x8
 ; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
@@ -798,11 +797,11 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f64_add_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_clause 0x8
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -827,11 +826,11 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f64_add_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_clause 0x8
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -856,6 +855,7 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x double> %c, %d
@@ -867,7 +867,6 @@ define <4 x double> @test_f64_add_mul(<4 x double> %a, <4 x double> %b, <4 x dou
 define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e) {
 ; GFX9-CONTRACT-LABEL: test_f64_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
@@ -889,11 +888,11 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f64_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; GFX9-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
@@ -915,11 +914,11 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[22:23], v[22:23], v[30:31], v[24:25]
 ; GFX9-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f64_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_clause 0x8
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-CONTRACT-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
@@ -942,11 +941,11 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f64_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_clause 0x8
 ; GFX10-DENORM-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-DENORM-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
@@ -969,11 +968,11 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f64_add_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_clause 0x8
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-CONTRACT-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -998,11 +997,11 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX11-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f64_add_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_clause 0x8
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-DENORM-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -1027,6 +1026,7 @@ define <4 x double> @test_f64_add_mul_rhs(<4 x double> %a, <4 x double> %b, <4 x
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX11-DENORM-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %x = fmul fast <4 x double> %c, %d
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 69346de9bb79805..dbbecb10ae9cf87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -11,52 +11,52 @@
 define float @test_f32_add_mul(float %x, float %y, float %z) {
 ; GFX9-LABEL: test_f32_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f32_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_f32_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f32_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_f32_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -67,52 +67,52 @@ define float @test_f32_add_mul(float %x, float %y, float %z) {
 define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
 ; GFX9-LABEL: test_f32_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f32_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -123,72 +123,72 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) {
 define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
 ; GFX9-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v3, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v3, v0, v1
 ; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -201,72 +201,72 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
 define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) {
 ; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
 ; GFX9-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fmac_f32_e32 v3, v0, v1
 ; GFX10-CONTRACT-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mac_f32_e32 v3, v0, v1
 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
 ; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fmac_f32_e32 v3, v0, v1
 ; GFX10-UNSAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -279,53 +279,53 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
 define half @test_half_add_mul(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_half_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_half_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_half_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_half_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_half_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul half %x, %y
@@ -336,53 +336,53 @@ define half @test_half_add_mul(half %x, half %y, half %z) {
 define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_half_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_f16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v0, v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_half_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f16_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_add_f16_e32 v0, v2, v0
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul half %x, %y
@@ -393,54 +393,54 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) {
 define double @test_double_add_mul(double %x, double %y, double %z) {
 ; GFX9-LABEL: test_double_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_double_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_double_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_double_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_double_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul double %x, %y
@@ -451,54 +451,54 @@ define double @test_double_add_mul(double %x, double %y, double %z) {
 define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
 ; GFX9-LABEL: test_double_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_double_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], v[4:5], v[0:1]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul double %x, %y
@@ -509,7 +509,6 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) {
 define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; GFX9-LABEL: test_4xfloat_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -518,38 +517,38 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_add_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_4xfloat_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v4, v8
 ; GFX9-DENORM-NEXT:    v_mad_f32 v1, v1, v5, v9
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v6, v10
 ; GFX9-DENORM-NEXT:    v_mad_f32 v3, v3, v7, v11
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_4xfloat_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -558,33 +557,34 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_4xfloat_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v4, v8
 ; GFX10-DENORM-NEXT:    v_mad_f32 v1, v1, v5, v9
 ; GFX10-DENORM-NEXT:    v_mad_f32 v2, v2, v6, v10
 ; GFX10-DENORM-NEXT:    v_mad_f32 v3, v3, v7, v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x float> %x, %y
@@ -595,72 +595,72 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl
 define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
 ; GFX9-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v5
 ; GFX9-NEXT:    v_add_f32_e32 v0, v6, v0
 ; GFX9-NEXT:    v_add_f32_e32 v1, v7, v1
 ; GFX9-NEXT:    v_add_f32_e32 v2, v8, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v3, v6
 ; GFX9-DENORM-NEXT:    v_mad_f32 v1, v1, v4, v7
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v5, v8
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GFX9-UNSAFE-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v5
 ; GFX10-NEXT:    v_add_f32_e32 v0, v6, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v7, v1
 ; GFX10-NEXT:    v_add_f32_e32 v2, v8, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v3, v6
 ; GFX10-DENORM-NEXT:    v_mad_f32 v1, v1, v4, v7
 ; GFX10-DENORM-NEXT:    v_mad_f32 v2, v2, v5, v8
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GFX10-UNSAFE-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x float> %x, %y
@@ -671,66 +671,66 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3
 define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX9-LABEL: test_4xhalf_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_4xhalf_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_4xhalf_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX10-NEXT:    v_pk_add_f16 v1, v1, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_4xhalf_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v1, v5
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x half> %x, %y
@@ -741,66 +741,66 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half>
 define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
 ; GFX9-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX9-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX10-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v0, v4, v0
 ; GFX10-DENORM-NEXT:    v_pk_add_f16 v1, v5, v1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-UNSAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x half> %x, %y
@@ -811,7 +811,6 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
 define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
 ; GFX9-LABEL: test_4xdouble_add_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -820,20 +819,20 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], v[18:19]
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], v[20:21]
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], v[22:23]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_4xdouble_add_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -842,20 +841,20 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
 ; GFX9-DENORM-NEXT:    v_add_f64 v[2:3], v[2:3], v[18:19]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[4:5], v[4:5], v[20:21]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[6:7], v[6:7], v[22:23]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_4xdouble_add_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -864,20 +863,20 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[18:19]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], v[20:21]
 ; GFX10-NEXT:    v_add_f64 v[6:7], v[6:7], v[22:23]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_4xdouble_add_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -886,15 +885,16 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
 ; GFX10-DENORM-NEXT:    v_add_f64 v[2:3], v[2:3], v[18:19]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[4:5], v[4:5], v[20:21]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[6:7], v[6:7], v[22:23]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_4xdouble_add_mul:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x double> %x, %y
@@ -905,78 +905,78 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4
 define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, <3 x double> %z) {
 ; GFX9-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GFX9-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[12:13], v[0:1]
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[14:15], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[16:17], v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], v[12:13], v[0:1]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[2:3], v[14:15], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[4:5], v[16:17], v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX9-UNSAFE:       ; %bb.0: ; %.entry
-; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GFX9-UNSAFE-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[12:13], v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[14:15], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[16:17], v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], v[12:13], v[0:1]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[2:3], v[14:15], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[4:5], v[16:17], v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-UNSAFE-LABEL: test_3xdouble_add_mul_rhs:
 ; GFX10-UNSAFE:       ; %bb.0: ; %.entry
-; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GFX10-UNSAFE-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <3 x double> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 240a7a8f30204b5..034ec7d4220afbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -14,54 +14,54 @@
 define float @test_f32_sub_mul(float %x, float %y, float %z) {
 ; GFX9-LABEL: test_f32_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f32_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f32_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f32_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f32_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -72,54 +72,54 @@ define float @test_f32_sub_mul(float %x, float %y, float %z) {
 define float @test_f32_sub_mul_rhs(float %x, float %y, float %z) {
 ; GFX9-LABEL: test_f32_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f32_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f32_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_f32_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_f32_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_sub_f32_e32 v0, v2, v0
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul float %x, %y
@@ -130,55 +130,55 @@ define float @test_f32_sub_mul_rhs(float %x, float %y, float %z) {
 define half @test_half_sub_mul(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_half_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_half_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v0, v0, v1, -v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_half_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_half_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_half_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul half %x, %y
@@ -189,55 +189,55 @@ define half @test_half_sub_mul(half %x, half %y, half %z) {
 define half @test_half_sub_mul_rhs(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_half_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_half_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_half_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v0, -v0, v1, v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_half_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_half_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_half_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_half_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_half_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul half %x, %y
@@ -248,56 +248,56 @@ define half @test_half_sub_mul_rhs(half %x, half %y, half %z) {
 define double @test_double_sub_mul(double %x, double %y, double %z) {
 ; GFX9-LABEL: test_double_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_double_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_double_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_double_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_double_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul double %x, %y
@@ -308,56 +308,56 @@ define double @test_double_sub_mul(double %x, double %y, double %z) {
 define double @test_double_sub_mul_rhs(double %x, double %y, double %z) {
 ; GFX9-LABEL: test_double_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_double_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_double_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_double_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_double_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_double_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_double_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_double_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_add_f64 v[0:1], v[4:5], -v[0:1]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul double %x, %y
@@ -368,7 +368,6 @@ define double @test_double_sub_mul_rhs(double %x, double %y, double %z) {
 define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; GFX9-LABEL: test_v4f32_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -377,29 +376,29 @@ define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x floa
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_sub_f32_e32 v3, v3, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f32_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, -v8
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, -v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, -v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, -v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f32_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, v4, -v8
 ; GFX9-DENORM-NEXT:    v_mad_f32 v1, v1, v5, -v9
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, v6, -v10
 ; GFX9-DENORM-NEXT:    v_mad_f32 v3, v3, v7, -v11
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f32_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -408,43 +407,44 @@ define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x floa
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_f32_e32 v3, v3, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f32_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, -v8
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, -v9
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, -v10
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, -v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f32_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, v4, -v8
 ; GFX10-DENORM-NEXT:    v_mad_f32 v1, v1, v5, -v9
 ; GFX10-DENORM-NEXT:    v_mad_f32 v2, v2, v6, -v10
 ; GFX10-DENORM-NEXT:    v_mad_f32 v3, v3, v7, -v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f32_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v0, v0, v4, -v8
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v1, v1, v5, -v9
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, v2, v6, -v10
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v3, v3, v7, -v11
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f32_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
 ; GFX11-DENORM-NEXT:    v_dual_mul_f32 v2, v2, v6 :: v_dual_mul_f32 v3, v3, v7
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_dual_sub_f32 v0, v0, v8 :: v_dual_sub_f32 v1, v1, v9
 ; GFX11-DENORM-NEXT:    v_dual_sub_f32 v2, v2, v10 :: v_dual_sub_f32 v3, v3, v11
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x float> %x, %y
@@ -455,7 +455,6 @@ define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x floa
 define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; GFX9-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -464,29 +463,29 @@ define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v9, v1
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v10, v2
 ; GFX9-NEXT:    v_sub_f32_e32 v3, v11, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v4, v8
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, -v1, v5, v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, -v2, v6, v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, -v3, v7, v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, -v0, v4, v8
 ; GFX9-DENORM-NEXT:    v_mad_f32 v1, -v1, v5, v9
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, -v2, v6, v10
 ; GFX9-DENORM-NEXT:    v_mad_f32 v3, -v3, v7, v11
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v6
@@ -495,43 +494,44 @@ define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v9, v1
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v10, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v3, v11, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v4, v8
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v1, -v1, v5, v9
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, -v2, v6, v10
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v3, -v3, v7, v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, -v0, v4, v8
 ; GFX10-DENORM-NEXT:    v_mad_f32 v1, -v1, v5, v9
 ; GFX10-DENORM-NEXT:    v_mad_f32 v2, -v2, v6, v10
 ; GFX10-DENORM-NEXT:    v_mad_f32 v3, -v3, v7, v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v0, -v0, v4, v8
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v1, -v1, v5, v9
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v2, -v2, v6, v10
 ; GFX11-CONTRACT-NEXT:    v_fma_f32 v3, -v3, v7, v11
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f32_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5
 ; GFX11-DENORM-NEXT:    v_dual_mul_f32 v2, v2, v6 :: v_dual_mul_f32 v3, v3, v7
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_dual_sub_f32 v0, v8, v0 :: v_dual_sub_f32 v1, v9, v1
 ; GFX11-DENORM-NEXT:    v_dual_sub_f32 v2, v10, v2 :: v_dual_sub_f32 v3, v11, v3
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x float> %x, %y
@@ -542,7 +542,6 @@ define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x
 define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX9-LABEL: test_v4f16_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -551,18 +550,18 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f16_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -571,11 +570,11 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -584,18 +583,18 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -604,18 +603,18 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f16_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX11-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
@@ -631,6 +630,7 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x half> %x, %y
@@ -641,7 +641,6 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX9-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-NEXT:    v_sub_f16_e32 v2, v4, v0
@@ -650,18 +649,18 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
@@ -670,11 +669,11 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v4, v0
@@ -683,18 +682,18 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
@@ -703,18 +702,18 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX11-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX11-DENORM-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
@@ -730,6 +729,7 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DENORM-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX11-DENORM-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x half> %x, %y
@@ -740,7 +740,6 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
 ; GFX9-LABEL: test_v4f64_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -749,20 +748,20 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], -v[18:19]
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], -v[20:21]
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], -v[22:23]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f64_sub_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f64_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -771,11 +770,11 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 ; GFX9-DENORM-NEXT:    v_add_f64 v[2:3], v[2:3], -v[18:19]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[4:5], v[4:5], -v[20:21]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[6:7], v[6:7], -v[22:23]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f64_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -784,20 +783,20 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[18:19]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], -v[20:21]
 ; GFX10-NEXT:    v_add_f64 v[6:7], v[6:7], -v[22:23]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f64_sub_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f64_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -806,20 +805,20 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 ; GFX10-DENORM-NEXT:    v_add_f64 v[2:3], v[2:3], -v[18:19]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[4:5], v[4:5], -v[20:21]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[6:7], v[6:7], -v[22:23]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f64_sub_mul:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f64_sub_mul:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -830,6 +829,7 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DENORM-NEXT:    v_add_f64 v[4:5], v[4:5], -v[20:21]
 ; GFX11-DENORM-NEXT:    v_add_f64 v[6:7], v[6:7], -v[22:23]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x double> %x, %y
@@ -840,7 +840,6 @@ define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x d
 define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
 ; GFX9-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -849,20 +848,20 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[18:19], -v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[20:21], -v[4:5]
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[22:23], -v[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX9-CONTRACT:       ; %bb.0: ; %.entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -871,11 +870,11 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4
 ; GFX9-DENORM-NEXT:    v_add_f64 v[2:3], v[18:19], -v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[4:5], v[20:21], -v[4:5]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[6:7], v[22:23], -v[6:7]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -884,20 +883,20 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[18:19], -v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[20:21], -v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[6:7], v[22:23], -v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX10-CONTRACT:       ; %bb.0: ; %.entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -906,20 +905,20 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4
 ; GFX10-DENORM-NEXT:    v_add_f64 v[2:3], v[18:19], -v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[4:5], v[20:21], -v[4:5]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[6:7], v[22:23], -v[6:7]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX11-CONTRACT:       ; %bb.0: ; %.entry
-; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21]
 ; GFX11-CONTRACT-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23]
+; GFX11-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-DENORM-LABEL: test_v4f64_sub_mul_rhs:
 ; GFX11-DENORM:       ; %bb.0: ; %.entry
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX11-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -930,6 +929,7 @@ define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DENORM-NEXT:    v_add_f64 v[4:5], v[20:21], -v[4:5]
 ; GFX11-DENORM-NEXT:    v_add_f64 v[6:7], v[22:23], -v[6:7]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a = fmul <4 x double> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 113f34d671502af..4cb129dd4228b36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -10,40 +10,40 @@
 define float @test_f32_sub_ext_neg_mul(float %x, float %y, float %z) {
 ; GFX9-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e64 v0, v0, -v1
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, -v1, -v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, -v1
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f32_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, -v1, -v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul float %x, %y
@@ -55,41 +55,41 @@ entry:
 define half @test_f16_sub_ext_neg_mul(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_legacy_f16 v0, v0, -v1, -v2
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX10-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f16_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v0, v2
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul half %x, %y
@@ -101,42 +101,42 @@ entry:
 define double @test_f64_sub_ext_neg_mul(double %x, double %y, double %z) {
 ; GFX9-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[4:5]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_f64_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[4:5]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul double %x, %y
@@ -149,7 +149,6 @@ entry:
 define <4 x float> @test_v4f32_sub_ext_neg_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
 ; GFX9-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e64 v0, v0, -v4
 ; GFX9-NEXT:    v_mul_f32_e64 v1, v1, -v5
 ; GFX9-NEXT:    v_mul_f32_e64 v2, v2, -v6
@@ -158,29 +157,29 @@ define <4 x float> @test_v4f32_sub_ext_neg_mul(<4 x float> %x, <4 x float> %y, <
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v9
 ; GFX9-NEXT:    v_sub_f32_e32 v2, v2, v10
 ; GFX9-NEXT:    v_sub_f32_e32 v3, v3, v11
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v0, v0, -v4, -v8
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v1, v1, -v5, -v9
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v2, v2, -v6, -v10
 ; GFX9-CONTRACT-NEXT:    v_fma_f32 v3, v3, -v7, -v11
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mad_f32 v0, v0, -v4, -v8
 ; GFX9-DENORM-NEXT:    v_mad_f32 v1, v1, -v5, -v9
 ; GFX9-DENORM-NEXT:    v_mad_f32 v2, v2, -v6, -v10
 ; GFX9-DENORM-NEXT:    v_mad_f32 v3, v3, -v7, -v11
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e64 v0, v0, -v4
 ; GFX10-NEXT:    v_mul_f32_e64 v1, v1, -v5
 ; GFX10-NEXT:    v_mul_f32_e64 v2, v2, -v6
@@ -189,24 +188,25 @@ define <4 x float> @test_v4f32_sub_ext_neg_mul(<4 x float> %x, <4 x float> %y, <
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v9
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v10
 ; GFX10-NEXT:    v_sub_f32_e32 v3, v3, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v0, v0, -v4, -v8
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v1, v1, -v5, -v9
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v2, v2, -v6, -v10
 ; GFX10-CONTRACT-NEXT:    v_fma_f32 v3, v3, -v7, -v11
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f32_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mad_f32 v0, v0, -v4, -v8
 ; GFX10-DENORM-NEXT:    v_mad_f32 v1, v1, -v5, -v9
 ; GFX10-DENORM-NEXT:    v_mad_f32 v2, v2, -v6, -v10
 ; GFX10-DENORM-NEXT:    v_mad_f32 v3, v3, -v7, -v11
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x float> %x, %y
@@ -218,7 +218,6 @@ entry:
 define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX9-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -227,18 +226,18 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
 ; GFX9-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -247,11 +246,11 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -260,18 +259,18 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
 ; GFX10-CONTRACT-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
@@ -280,6 +279,7 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x half> %x, %y
@@ -291,7 +291,6 @@ entry:
 define <4 x double> @test_v4f64_sub_ext_neg_mul(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
 ; GFX9-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -300,20 +299,20 @@ define <4 x double> @test_v4f64_sub_ext_neg_mul(<4 x double> %x, <4 x double> %y
 ; GFX9-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[18:19]
 ; GFX9-NEXT:    v_add_f64 v[4:5], -v[4:5], -v[20:21]
 ; GFX9-NEXT:    v_add_f64 v[6:7], -v[6:7], -v[22:23]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX9-CONTRACT:       ; %bb.0: ; %entry
-; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], -v[16:17]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], -v[18:19]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[12:13], -v[20:21]
 ; GFX9-CONTRACT-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], -v[22:23]
+; GFX9-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-DENORM-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
-; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX9-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -322,11 +321,11 @@ define <4 x double> @test_v4f64_sub_ext_neg_mul(<4 x double> %x, <4 x double> %y
 ; GFX9-DENORM-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[18:19]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[4:5], -v[4:5], -v[20:21]
 ; GFX9-DENORM-NEXT:    v_add_f64 v[6:7], -v[6:7], -v[22:23]
+; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -335,20 +334,20 @@ define <4 x double> @test_v4f64_sub_ext_neg_mul(<4 x double> %x, <4 x double> %y
 ; GFX10-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[18:19]
 ; GFX10-NEXT:    v_add_f64 v[4:5], -v[4:5], -v[20:21]
 ; GFX10-NEXT:    v_add_f64 v[6:7], -v[6:7], -v[22:23]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX10-CONTRACT:       ; %bb.0: ; %entry
-; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[8:9], -v[16:17]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[10:11], -v[18:19]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[12:13], -v[20:21]
 ; GFX10-CONTRACT-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], -v[22:23]
+; GFX10-CONTRACT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-CONTRACT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-DENORM-LABEL: test_v4f64_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
-; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[2:3], v[2:3], v[10:11]
 ; GFX10-DENORM-NEXT:    v_mul_f64 v[4:5], v[4:5], v[12:13]
@@ -357,6 +356,7 @@ define <4 x double> @test_v4f64_sub_ext_neg_mul(<4 x double> %x, <4 x double> %y
 ; GFX10-DENORM-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[18:19]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[4:5], -v[4:5], -v[20:21]
 ; GFX10-DENORM-NEXT:    v_add_f64 v[6:7], -v[6:7], -v[22:23]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x double> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44a18..a9eb71fa3bc7dd3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -8,15 +8,15 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i32_to_f32_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i32_to_f32_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to float
@@ -26,15 +26,15 @@ define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_sitofp_i32_to_f32_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_sitofp_i32_to_f32_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = sitofp i32 %masked to float
@@ -44,9 +44,9 @@ define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 8
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.7 = lshr i32 %arg0, 7
   %masked = and i32 %lshr.7, 255
@@ -57,15 +57,15 @@ define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -76,7 +76,6 @@ define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -84,16 +83,15 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   store i32 %lshr.8, ptr addrspace(1) undef
@@ -105,15 +103,15 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -124,8 +122,8 @@ define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -136,15 +134,15 @@ define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i8_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to float
   ret float %cvt
@@ -153,20 +151,20 @@ define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_v2i8_to_v2f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
 ; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %val = bitcast i16 %arg0 to <2 x i8>
   %cvt = uitofp <2 x i8> %val to <2 x float>
@@ -176,7 +174,6 @@ define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_v3i8_to_v3f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
@@ -184,15 +181,16 @@ define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i32 %arg0 to i24
   %val = bitcast i24 %trunc to <3 x i8>
@@ -203,7 +201,6 @@ define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_v4i8_to_v4f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
@@ -212,16 +209,17 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %val = bitcast i32 %arg0 to <4 x i8>
   %cvt = uitofp <4 x i8> %val to <4 x float>
@@ -231,7 +229,6 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
 ; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
@@ -240,16 +237,17 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %mask.arg0 = and i32 %arg0, 255
   %cvt0 = uitofp i32 %mask.arg0 to float
@@ -276,17 +274,17 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to half
@@ -296,17 +294,17 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = sitofp i32 %masked to half
@@ -316,17 +314,17 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -337,17 +335,17 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -358,9 +356,9 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -371,17 +369,17 @@ define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i8_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i8_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to half
   ret half %cvt
@@ -390,9 +388,9 @@ define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to double
@@ -402,9 +400,9 @@ define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -415,9 +413,9 @@ define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -428,9 +426,9 @@ define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -441,9 +439,9 @@ define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_i8_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to double
   ret double %cvt
@@ -1432,7 +1430,6 @@ bb:
 define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; SI-NEXT:    v_ffbh_i32_e32 v3, 0
@@ -1446,11 +1443,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; VI-NEXT:    v_ffbh_i32_e32 v3, 0
@@ -1464,6 +1461,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
 ; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i64 %arg0, 255
   %itofp = sitofp i64 %masked to float
@@ -1473,7 +1471,6 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
 define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_ffbh_u32_e32 v2, 0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1484,11 +1481,11 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 ; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v2, 0
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
@@ -1499,6 +1496,7 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
 ; VI-NEXT:    v_ldexp_f32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i64 %arg0, 255
   %itofp = uitofp i64 %masked to float
@@ -1508,17 +1506,17 @@ define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
 define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
 ; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i16 %arg0, 255
   %itofp = sitofp i16 %masked to float
@@ -1528,18 +1526,18 @@ define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
 define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
 ; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i16 %arg0, 255
   %itofp = uitofp i16 %masked to float
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 25e2267fdee89e2..f66d73cfc547511 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -6,9 +6,9 @@
 define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
 ; CHECK-LABEL: divergent_if_swap_brtarget_order0:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
@@ -16,7 +16,7 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:  .LBB0_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = icmp ne i32 %value, 0
@@ -34,9 +34,9 @@ endif:
 define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
 ; CHECK-LABEL: divergent_if_swap_brtarget_order1:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB1_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
@@ -44,7 +44,7 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:  .LBB1_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = icmp ne i32 %value, 0
@@ -63,10 +63,10 @@ if.true:
 define i32 @divergent_if_nonboolean_condition0(i32 %value) {
 ; CHECK-LABEL: divergent_if_nonboolean_condition0:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB2_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
@@ -74,7 +74,7 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:  .LBB2_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %c = trunc i32 %value to i1
@@ -93,12 +93,12 @@ endif:
 define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: divergent_if_nonboolean_condition1:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB3_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.true
@@ -106,7 +106,7 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:  .LBB3_2: ; %endif
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %value = load i32, ptr addrspace(1) %ptr
@@ -130,7 +130,6 @@ endif:
 define void @constrained_if_register_class() {
 ; CHECK-LABEL: constrained_if_register_class:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, external_constant at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, external_constant at gotpcrel32@hi+12
@@ -141,6 +140,7 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: ; %bb12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB4_2: ; %bb2
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
@@ -164,7 +164,7 @@ define void @constrained_if_register_class() {
 ; CHECK-NEXT:  ; %bb.5: ; %bb11
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4.0
 ; CHECK-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = load i32, ptr addrspace(4) @external_constant
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 96db1f889690df6..676afd7727c6c00 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -63,7 +63,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
 define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -83,12 +82,11 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -108,11 +106,11 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -133,6 +131,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX11-NEXT:    s_add_u32 s0, s32, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv, align 4
   %alloca = alloca i32, i32 %n, addrspace(5)
@@ -198,7 +197,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
 define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
@@ -218,12 +216,11 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
@@ -243,11 +240,11 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
@@ -268,6 +265,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX11-NEXT:    s_add_u32 s0, s32, s0
 ; GFX11-NEXT:    s_add_i32 s32, s32, -16
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv, align 16
   %alloca = alloca i32, i32 %n, addrspace(5)
@@ -336,7 +334,6 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
 define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7c0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffff800
@@ -358,12 +355,11 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf000
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3e0
 ; GFX10-NEXT:    s_addk_i32 s32, 0x800
@@ -385,11 +381,11 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xfffffc00
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 31
 ; GFX11-NEXT:    s_add_i32 s32, s32, 64
@@ -412,6 +408,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffc0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xfffffc00
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %n = load i32, ptr addrspace(4) @gv
   %alloca = alloca i32, i32 %n, align 32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 33a4d3c5494f7c9..86e86bbee59c3e7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -7,7 +7,6 @@
 define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v64i32_varidx:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
@@ -172,7 +171,7 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 %idx
@@ -182,7 +181,6 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v128i16_varidx:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
@@ -351,7 +349,7 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(16)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <128 x i16>, ptr addrspace(1) %ptr
   %elt = extractelement <128 x i16> %vec, i32 %idx
@@ -361,7 +359,6 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-LABEL: v_extract_v32i64_varidx:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0x3fc0
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xffffc000
@@ -527,7 +524,7 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <32 x i64>, ptr addrspace(1) %ptr
   %elt = extractelement <32 x i64> %vec, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 786d65f7dcc40db..eaa082b89adaa39 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -157,7 +157,6 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
 define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
@@ -210,11 +209,11 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
@@ -273,11 +272,11 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v17
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v15, vcc
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s10, 0
 ; GFX7-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[8:9], 0
@@ -333,11 +332,11 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GFX7-NEXT:    v_cndmask_b32_e32 v2, v3, v14, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v15, vcc
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
@@ -391,11 +390,11 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    global_load_b128 v[16:19], v[0:1], off
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
@@ -450,6 +449,7 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 %idx
@@ -921,40 +921,35 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %p
 define i128 @extractelement_vgpr_v4i128_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 0
@@ -964,42 +959,37 @@ define i128 @extractelement_vgpr_v4i128_idx0(ptr addrspace(1) %ptr) {
 define i128 @extractelement_vgpr_v4i128_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 1
@@ -1009,42 +999,37 @@ define i128 @extractelement_vgpr_v4i128_idx1(ptr addrspace(1) %ptr) {
 define i128 @extractelement_vgpr_v4i128_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:32
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 2
@@ -1054,42 +1039,37 @@ define i128 @extractelement_vgpr_v4i128_idx2(ptr addrspace(1) %ptr) {
 define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 48, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:48
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:48
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i128>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i128> %vector, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 7028d1157787fe9..be8519fae0df230 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -126,7 +126,6 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
 define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -135,11 +134,11 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -148,11 +147,11 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -164,11 +163,11 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -177,11 +176,11 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -191,6 +190,7 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 %idx
@@ -351,40 +351,35 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(ptr addrspace(4) inreg %ptr
 define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 0
@@ -394,45 +389,45 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 1
@@ -442,45 +437,45 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 2
@@ -490,45 +485,45 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i16_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i16> %vector, i32 3
@@ -689,7 +684,6 @@ define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr,
 define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -702,11 +696,11 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -719,11 +713,11 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -739,11 +733,11 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -756,11 +750,11 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -775,6 +769,7 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 %idx
@@ -1055,40 +1050,35 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(ptr addrspace(4) inreg %ptr
 define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 0
@@ -1098,45 +1088,45 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 1
@@ -1146,45 +1136,45 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 2
@@ -1194,45 +1184,45 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 3
@@ -1242,45 +1232,45 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 4
@@ -1290,45 +1280,45 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 5
@@ -1338,45 +1328,45 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 6
@@ -1386,45 +1376,45 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
 define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i16_idx7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i16>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i16> %vector, i32 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c7f49d526fac06a..832a74a6e902b13 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -100,27 +100,26 @@ define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i3
 define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -129,27 +128,28 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    v_and_b32_e32 v1, 3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 %idx
@@ -317,40 +317,35 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(ptr addrspace(4) inreg %ptr)
 define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 0
@@ -360,45 +355,45 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 1
@@ -408,45 +403,45 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 2
@@ -456,45 +451,45 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v4i8_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <4 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <4 x i8> %vector, i32 3
@@ -622,7 +617,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i3
 define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -631,11 +625,11 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -644,11 +638,11 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -660,11 +654,11 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -673,11 +667,11 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -687,6 +681,7 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 %idx
@@ -951,40 +946,35 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(ptr addrspace(4) inreg %ptr)
 define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 0
@@ -994,45 +984,45 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 1
@@ -1042,45 +1032,45 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 2
@@ -1090,45 +1080,45 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 3
@@ -1138,45 +1128,45 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 4
@@ -1186,45 +1176,45 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 5
@@ -1234,45 +1224,45 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 6
@@ -1282,45 +1272,45 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v8i8_idx7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <8 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <8 x i8> %vector, i32 7
@@ -1481,7 +1471,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i
 define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -1494,11 +1483,11 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -1511,11 +1500,11 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1531,11 +1520,11 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -1548,11 +1537,11 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[3:6], v[0:1], off
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
@@ -1567,6 +1556,7 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v6, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 %idx
@@ -1642,40 +1632,35 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %
 define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 0
@@ -1685,45 +1670,45 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 1
@@ -1733,45 +1718,45 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 2
@@ -1781,45 +1766,45 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 3
@@ -1829,45 +1814,45 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 4
@@ -1877,45 +1862,45 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 5
@@ -1925,45 +1910,45 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 6
@@ -1973,45 +1958,45 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 7
@@ -2021,45 +2006,45 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 8
@@ -2069,45 +2054,45 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx9:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 9
@@ -2117,45 +2102,45 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 10
@@ -2165,45 +2150,45 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx11:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 11
@@ -2213,45 +2198,45 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx12:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 12
@@ -2261,45 +2246,45 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx13:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 13
@@ -2309,45 +2294,45 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx14:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 14
@@ -2357,45 +2342,45 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
 define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: extractelement_vgpr_v16i8_idx15:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vector = load <16 x i8>, ptr addrspace(1) %ptr
   %element = extractelement <16 x i8> %vector, i32 15
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8bf34caea40513d..f15528b5bd6b390 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -7,7 +7,6 @@
 define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, 2.0, vcc
@@ -27,11 +26,11 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v8f32_const_s_v:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
@@ -46,6 +45,7 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v1, 0x41000000, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
@@ -159,7 +159,6 @@ entry:
 define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8f32_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
@@ -174,11 +173,11 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v8f32_v_v:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
@@ -193,6 +192,7 @@ define float @dyn_extract_v8f32_v_v(<8 x float> %vec, i32 %sel) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x float> %vec, i32 %sel
@@ -291,7 +291,6 @@ entry:
 define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8i64_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4
@@ -337,11 +336,11 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v15, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v16, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v8i64_const_s_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b64 s[6:7], 2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s6
@@ -373,11 +372,11 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s18, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s19, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v8i64_const_s_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b64 s[2:3], 2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
@@ -408,6 +407,7 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, s14, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, s15, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel
@@ -695,7 +695,6 @@ entry:
 define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v8i64_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -717,11 +716,11 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v8i64_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -743,11 +742,11 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v8i64_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v16
@@ -762,6 +761,7 @@ define i64 @dyn_extract_v8i64_v_v(<8 x i64> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x i64> %vec, i32 %sel
@@ -976,7 +976,6 @@ entry:
 define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    v_add_u32_e32 v8, 3, v8
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -992,11 +991,11 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -1012,11 +1011,11 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v8f32_v_v_offset3:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e32 v8, 3, v8
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -1032,6 +1031,7 @@ define float @dyn_extract_v8f32_v_v_offset3(<8 x float> %vec, i32 %sel) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -1465,7 +1465,6 @@ entry:
 define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    v_add_u32_e32 v16, 3, v16
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1488,11 +1487,11 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v16, vcc, 3, v16
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -1515,11 +1514,11 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v16, 3, v16
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -1542,11 +1541,11 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v8f64_v_v_offset3:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v16
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2
@@ -1562,6 +1561,7 @@ define double @dyn_extract_v8f64_v_v_offset3(<8 x double> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v13 :: v_dual_cndmask_b32 v0, v0, v12
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v15 :: v_dual_cndmask_b32 v0, v0, v14
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -1572,7 +1572,6 @@ entry:
 define ptr addrspace(3) @dyn_extract_v8p3_v_v(<8 x ptr addrspace(3)> %vec, i32 %idx) {
 ; GCN-LABEL: dyn_extract_v8p3_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v8
@@ -1587,11 +1586,11 @@ define ptr addrspace(3) @dyn_extract_v8p3_v_v(<8 x ptr addrspace(3)> %vec, i32 %
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v8p3_v_v:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v8
@@ -1606,6 +1605,7 @@ define ptr addrspace(3) @dyn_extract_v8p3_v_v(<8 x ptr addrspace(3)> %vec, i32 %
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v8
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x ptr addrspace(3)> %vec, i32 %idx
@@ -1690,7 +1690,6 @@ entry:
 define ptr addrspace(1) @dyn_extract_v8p1_v_v(<8 x ptr addrspace(1)> %vec, i32 %idx) {
 ; GCN-LABEL: dyn_extract_v8p1_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1712,11 +1711,11 @@ define ptr addrspace(1) @dyn_extract_v8p1_v_v(<8 x ptr addrspace(1)> %vec, i32 %
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v8p1_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -1738,11 +1737,11 @@ define ptr addrspace(1) @dyn_extract_v8p1_v_v(<8 x ptr addrspace(1)> %vec, i32 %
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v8p1_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v16
@@ -1757,6 +1756,7 @@ define ptr addrspace(1) @dyn_extract_v8p1_v_v(<8 x ptr addrspace(1)> %vec, i32 %
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v16
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <8 x ptr addrspace(1)> %vec, i32 %idx
@@ -2204,7 +2204,6 @@ entry:
 define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v6f32_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v6
@@ -2215,11 +2214,11 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v6f32_v_v:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v6
@@ -2230,6 +2229,7 @@ define float @dyn_extract_v6f32_v_v(<6 x float> %vec, i32 %sel) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v6
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <6 x float> %vec, i32 %sel
@@ -2352,7 +2352,6 @@ entry:
 define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v7f32_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v7
@@ -2365,11 +2364,11 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v7
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: dyn_extract_v7f32_v_v:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v7
@@ -2382,6 +2381,7 @@ define float @dyn_extract_v7f32_v_v(<7 x float> %vec, i32 %sel) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 6, v7
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x float> %vec, i32 %sel
@@ -2582,7 +2582,6 @@ entry:
 define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v6f64_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2598,11 +2597,11 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v12
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v6f64_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -2618,11 +2617,11 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v12
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v6f64_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v12
@@ -2633,6 +2632,7 @@ define double @dyn_extract_v6f64_v_v(<6 x double> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 5, v12
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <6 x double> %vec, i32 %sel
@@ -2860,7 +2860,6 @@ entry:
 define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v7f64_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v14
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -2882,11 +2881,11 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v14
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v7f64_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -2908,11 +2907,11 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v14
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v7f64_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v14
@@ -2927,6 +2926,7 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v14
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <7 x double> %vec, i32 %sel
@@ -3389,7 +3389,6 @@ entry:
 define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GCN-LABEL: dyn_extract_v15f32_const_s_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, 2.0, vcc
@@ -3432,11 +3431,11 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v15f32_const_s_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
@@ -3467,11 +3466,11 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s4, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v15f32_const_s_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v0
@@ -3502,6 +3501,7 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel
@@ -3649,7 +3649,6 @@ entry:
 define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GCN-LABEL: dyn_extract_v15f32_v_v:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v15
@@ -3680,11 +3679,11 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v15
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v15f32_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v15
@@ -3715,11 +3714,11 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v15f32_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 2, v15
@@ -3750,6 +3749,7 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v15
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <15 x float> %vec, i32 %sel
@@ -3882,7 +3882,6 @@ entry:
 define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GPRIDX-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    v_add_u32_e32 v15, 3, v15
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -3914,11 +3913,11 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v15
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; MOVREL:       ; %bb.0: ; %entry
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v15, vcc, 3, v15
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v15
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -3950,11 +3949,11 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
 ; MOVREL-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v15
 ; MOVREL-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v15, 3, v15
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -3986,11 +3985,11 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s4, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_extract_v15f32_v_v_offset3:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v15
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -4022,6 +4021,7 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 15, v15
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %sel, 3
@@ -4737,36 +4737,36 @@ entry:
 define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_7:
 ; GPRIDX:       ; %bb.0:
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v7
+; GPRIDX-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_7:
 ; MOVREL:       ; %bb.0:
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v7
+; MOVREL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v7
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v7
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 7
@@ -4776,32 +4776,28 @@ define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) {
 define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_32:
 ; GPRIDX:       ; %bb.0:
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_32:
 ; MOVREL:       ; %bb.0:
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; MOVREL-NEXT:    s_waitcnt vmcnt(0)
+; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:128
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 32
@@ -4811,36 +4807,36 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
 define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_33:
 ; GPRIDX:       ; %bb.0:
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v1
+; GPRIDX-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_33:
 ; MOVREL:       ; %bb.0:
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v1
+; MOVREL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:128
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 33
@@ -4850,36 +4846,36 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
 define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
 ; GPRIDX-LABEL: v_extract_v64i32_37:
 ; GPRIDX:       ; %bb.0:
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v5
+; GPRIDX-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: v_extract_v64i32_37:
 ; MOVREL:       ; %bb.0:
-; MOVREL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    v_add_u32_e32 v0, vcc, 0x90, v0
 ; MOVREL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; MOVREL-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; MOVREL-NEXT:    s_waitcnt vmcnt(0)
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, v5
+; MOVREL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_extract_v64i32_37:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:144
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_extract_v64i32_37:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[0:1], off offset:144
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v5
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = load <64 x i32>, ptr addrspace(1) %ptr
   %elt = extractelement <64 x i32> %vec, i32 37
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 29233c77650b919..4030406f727442f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -18,7 +18,6 @@
 define half @v_fdiv_f16(half %a, half %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -33,11 +32,11 @@ define half @v_fdiv_f16(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -55,56 +54,57 @@ define half @v_fdiv_f16(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_fdiv_f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half %a, %b
   ret half %fdiv
@@ -113,34 +113,34 @@ define half @v_fdiv_f16(half %a, half %b) {
 define half @v_fdiv_f16_afn(half %a, half %b) {
 ; GFX6-LABEL: v_fdiv_f16_afn:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_fdiv_f16_afn:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn half %a, %b
   ret half %fdiv
@@ -149,7 +149,6 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
 define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -164,11 +163,11 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -186,56 +185,57 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f16_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half %a, %b
   ret half %fdiv
@@ -244,7 +244,6 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 define half @v_neg_rcp_f16(half %x) {
 ; GFX6-IEEE-LABEL: v_neg_rcp_f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -259,11 +258,11 @@ define half @v_neg_rcp_f16(half %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -281,18 +280,19 @@ define half @v_neg_rcp_f16(half %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rcp_f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_neg_rcp_f16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half -1.0, %x
   ret half %fdiv
@@ -301,7 +301,6 @@ define half @v_neg_rcp_f16(half %x) {
 define half @v_rcp_f16(half %x) {
 ; GFX6-IEEE-LABEL: v_rcp_f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -316,11 +315,11 @@ define half @v_rcp_f16(half %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -338,18 +337,19 @@ define half @v_rcp_f16(half %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rcp_f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rcp_f16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half 1.0, %x
   ret half %fdiv
@@ -358,7 +358,6 @@ define half @v_rcp_f16(half %x) {
 define half @v_rcp_f16_arcp(half %x) {
 ; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -373,11 +372,11 @@ define half @v_rcp_f16_arcp(half %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -395,18 +394,19 @@ define half @v_rcp_f16_arcp(half %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rcp_f16_arcp:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rcp_f16_arcp:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp half 1.0, %x
   ret half %fdiv
@@ -415,24 +415,24 @@ define half @v_rcp_f16_arcp(half %x) {
 define half @v_rcp_f16_arcp_afn(half %x) {
 ; GFX6-LABEL: v_rcp_f16_arcp_afn:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rcp_f16_arcp_afn:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rcp_f16_arcp_afn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn half 1.0, %x
   ret half %fdiv
@@ -441,7 +441,6 @@ define half @v_rcp_f16_arcp_afn(half %x) {
 define half @v_rcp_f16_ulp25(half %x) {
 ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -456,11 +455,11 @@ define half @v_rcp_f16_ulp25(half %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
@@ -478,18 +477,19 @@ define half @v_rcp_f16_ulp25(half %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rcp_f16_ulp25:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rcp_f16_ulp25:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half 1.0, %x
   ret half %fdiv
@@ -498,34 +498,34 @@ define half @v_rcp_f16_ulp25(half %x) {
 define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
 ; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn half %a, %b
   ret half %fdiv
@@ -534,7 +534,6 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
 define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -549,11 +548,11 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -571,28 +570,29 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_arcp_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp half %a, %b
   ret half %fdiv
@@ -601,7 +601,6 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
 define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -630,11 +629,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
@@ -668,11 +667,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
@@ -689,11 +688,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v4
@@ -709,11 +708,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_fdiv_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
@@ -725,11 +724,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, v5
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -741,11 +740,11 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -758,6 +757,7 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -766,7 +766,6 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -777,41 +776,41 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_afn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f16_afn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -820,6 +819,7 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -828,7 +828,6 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -857,11 +856,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
@@ -895,11 +894,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
@@ -916,11 +915,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v4
@@ -936,11 +935,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
@@ -952,11 +951,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, v5
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -968,11 +967,11 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -985,6 +984,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -993,7 +993,6 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -1021,11 +1020,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
@@ -1059,11 +1058,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1078,11 +1077,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1096,11 +1095,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_rcp_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1111,11 +1110,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1126,11 +1125,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1142,6 +1141,7 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
@@ -1150,7 +1150,6 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -1178,11 +1177,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
@@ -1216,11 +1215,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_neg_rcp_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1235,11 +1234,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1253,11 +1252,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1268,11 +1267,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rcp_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1283,11 +1282,11 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rcp_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1299,6 +1298,7 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
   ret <2 x half> %fdiv
@@ -1307,7 +1307,6 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16_fabs:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-IEEE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1340,11 +1339,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-FLUSH-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1383,11 +1382,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_fabs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -1403,11 +1402,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -1422,11 +1421,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -1438,11 +1437,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -1454,11 +1453,11 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -1471,6 +1470,7 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
@@ -1480,7 +1480,6 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-IEEE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1513,11 +1512,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v5, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-FLUSH-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1556,11 +1555,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v1, v5, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -1576,11 +1575,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -1595,11 +1594,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -1611,11 +1610,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -1627,11 +1626,11 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -1644,6 +1643,7 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
   %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
@@ -1653,7 +1653,6 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -1681,11 +1680,11 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
@@ -1719,40 +1718,41 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_arcp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX8-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_v2f16_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rcp_f16_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
@@ -1761,7 +1761,6 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
@@ -1771,40 +1770,41 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX8-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
 ; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rcp_f16_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
@@ -1813,7 +1813,6 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -1841,11 +1840,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
@@ -1879,11 +1878,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f16_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1898,11 +1897,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1916,11 +1915,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
@@ -1931,11 +1930,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1946,11 +1945,11 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1962,6 +1961,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
@@ -1970,7 +1970,6 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1981,41 +1980,41 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -2024,6 +2023,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -2032,7 +2032,6 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -2061,11 +2060,11 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
@@ -2099,41 +2098,41 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -2142,6 +2141,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -2150,7 +2150,6 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -2161,41 +2160,41 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
 ; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -2204,6 +2203,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x half> %a, %b
   ret <2 x half> %fdiv
@@ -2999,7 +2999,6 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
 define half @v_rsq_f16(half %a) {
 ; GFX6-IEEE-LABEL: v_rsq_f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3017,11 +3016,11 @@ define half @v_rsq_f16(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3042,18 +3041,19 @@ define half @v_rsq_f16(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rsq_f16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract half 1.0, %sqrt
@@ -3063,7 +3063,6 @@ define half @v_rsq_f16(half %a) {
 define half @v_neg_rsq_f16(half %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3081,11 +3080,11 @@ define half @v_neg_rsq_f16(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3106,28 +3105,29 @@ define half @v_neg_rsq_f16(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rsq_f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract half -1.0, %sqrt
@@ -3137,7 +3137,6 @@ define half @v_neg_rsq_f16(half %a) {
 define { half, half } @v_rsq_f16_multi_use(half %a) {
 ; GFX6-IEEE-LABEL: v_rsq_f16_multi_use:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3155,11 +3154,11 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_f16_multi_use:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3180,31 +3179,32 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_multi_use:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v2, v0
 ; GFX89-NEXT:    v_rsq_f16_e32 v1, v0
 ; GFX89-NEXT:    v_mov_b32_e32 v0, v2
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_f16_multi_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v2, v0
 ; GFX10-NEXT:    v_rsq_f16_e32 v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_f16_multi_use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v2, v0
 ; GFX11-NEXT:    v_rsq_f16_e32 v1, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %insert.0 = insertvalue { half, half } poison, half %sqrt, 0
@@ -3216,7 +3216,6 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
 define half @v_rsq_f16_missing_contract0(half %a) {
 ; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract0:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3234,11 +3233,11 @@ define half @v_rsq_f16_missing_contract0(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract0:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3259,28 +3258,29 @@ define half @v_rsq_f16_missing_contract0(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_missing_contract0:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract half 1.0, %sqrt
@@ -3290,7 +3290,6 @@ define half @v_rsq_f16_missing_contract0(half %a) {
 define half @v_rsq_f16_missing_contract1(half %a) {
 ; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract1:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3308,11 +3307,11 @@ define half @v_rsq_f16_missing_contract1(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract1:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3333,28 +3332,29 @@ define half @v_rsq_f16_missing_contract1(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_missing_contract1:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_f16_missing_contract1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv half 1.0, %sqrt
@@ -3364,7 +3364,6 @@ define half @v_rsq_f16_missing_contract1(half %a) {
 define half @v_neg_rsq_f16_missing_contract0(half %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract0:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3382,11 +3381,11 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract0:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3407,28 +3406,29 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rsq_f16_missing_contract0:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract half -1.0, %sqrt
@@ -3438,7 +3438,6 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
 define half @v_neg_rsq_f16_missing_contract1(half %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3456,11 +3455,11 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3481,28 +3480,29 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv half -1.0, %sqrt
@@ -3512,7 +3512,6 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
 define half @v_neg_rsq_f16_fabs(half %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_fabs:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3530,11 +3529,11 @@ define half @v_neg_rsq_f16_fabs(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_fabs:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3555,28 +3554,29 @@ define half @v_neg_rsq_f16_fabs(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rsq_f16_fabs:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e64 v0, |v0|
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e64 v0, |v0|
 ; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e64 v0, |v0|
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
   %sqrt = call contract half @llvm.sqrt.f16(half %a.fabs)
@@ -3587,7 +3587,6 @@ define half @v_neg_rsq_f16_fabs(half %a) {
 define half @v_rsq_f16_arcp(half %a) {
 ; GFX6-IEEE-LABEL: v_rsq_f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3605,11 +3604,11 @@ define half @v_rsq_f16_arcp(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3630,18 +3629,19 @@ define half @v_rsq_f16_arcp(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_arcp:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rsq_f16_arcp:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract arcp half 1.0, %sqrt
@@ -3651,7 +3651,6 @@ define half @v_rsq_f16_arcp(half %a) {
 define half @v_neg_rsq_f16_arcp(half %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_f16_arcp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3669,11 +3668,11 @@ define half @v_neg_rsq_f16_arcp(half %a) {
 ; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_f16_arcp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
 ; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3694,28 +3693,29 @@ define half @v_neg_rsq_f16_arcp(half %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_neg_rsq_f16_arcp:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_f16_arcp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_f16_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv contract arcp half -1.0, %sqrt
@@ -3725,7 +3725,6 @@ define half @v_neg_rsq_f16_arcp(half %a) {
 define half @v_rsq_f16_afn(half %a) {
 ; GFX6-LABEL: v_rsq_f16_afn:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3734,18 +3733,19 @@ define half @v_rsq_f16_afn(half %a) {
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_afn:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_rsq_f16_afn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv afn contract half 1.0, %sqrt
@@ -3755,7 +3755,6 @@ define half @v_rsq_f16_afn(half %a) {
 define half @v_rsq_f16_afn_nocontract(half %a) {
 ; GFX6-LABEL: v_rsq_f16_afn_nocontract:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
 ; GFX6-NEXT:    v_sqrt_f32_e32 v0, v0
@@ -3764,28 +3763,29 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v_rsq_f16_afn_nocontract:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX89-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_f16_afn_nocontract:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_f16_afn_nocontract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv afn half 1.0, %sqrt
@@ -3795,7 +3795,6 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
 define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-LABEL: v_rsq_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
@@ -3830,11 +3829,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_rsq_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
@@ -3874,11 +3873,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rsq_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX8-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
@@ -3894,11 +3893,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
@@ -3913,11 +3912,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_rsq_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
@@ -3929,11 +3928,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
@@ -3945,11 +3944,11 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
@@ -3964,6 +3963,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
   %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
@@ -3973,7 +3973,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-LABEL: v_neg_rsq_v2f16:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
@@ -4008,11 +4007,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
@@ -4052,11 +4051,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
 ; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_neg_rsq_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX8-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
@@ -4072,11 +4071,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX8-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
-; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
@@ -4091,11 +4090,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
-; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
@@ -4107,11 +4106,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
@@ -4123,11 +4122,11 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
@@ -4142,6 +4141,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
   %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 1a8529b9101c3fb..1d781e341f8138e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -20,7 +20,6 @@
 define float @v_fdiv_f32(float %a, float %b) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -32,11 +31,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -50,11 +49,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -66,11 +65,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -84,11 +83,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
@@ -100,11 +99,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
@@ -118,11 +117,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
@@ -134,11 +133,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v2, -v2, v4, v5
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v4
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
@@ -152,11 +151,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -173,11 +172,11 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v4
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -196,6 +195,7 @@ define float @v_fdiv_f32(float %a, float %b) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float %a, %b
   ret float %fdiv
@@ -205,7 +205,6 @@ define float @v_fdiv_f32(float %a, float %b) {
 define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -220,11 +219,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -239,11 +238,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -258,11 +257,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -277,11 +276,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -296,11 +295,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -315,11 +314,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -334,11 +333,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -353,11 +352,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -377,11 +376,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -401,6 +400,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float %a, %b
   ret float %fdiv
@@ -409,24 +409,24 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
 define float @v_fdiv_f32_afn(float %a, float %b) {
 ; GCN-LABEL: v_fdiv_f32_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float %a, %b
   ret float %fdiv
@@ -435,7 +435,6 @@ define float @v_fdiv_f32_afn(float %a, float %b) {
 define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -449,11 +448,11 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_f32_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_mov_b32_e32 v2, 0x6f800000
 ; GCN-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x2f800000
 ; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, v2
@@ -462,11 +461,11 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -475,22 +474,22 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX10-IEEE-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -502,11 +501,11 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX11-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
@@ -516,6 +515,7 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-FLUSH-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float %a, %b, !fpmath !0
   ret float %fdiv
@@ -568,7 +568,6 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -582,11 +581,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -600,11 +599,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -613,11 +612,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -629,6 +628,7 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; EG:       ; %bb.0:
@@ -641,7 +641,6 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 define float @v_rcp_f32(float %x) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_rcp_f32:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -653,11 +652,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_rcp_f32:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -671,11 +670,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_rcp_f32:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
@@ -687,11 +686,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_rcp_f32:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
@@ -705,11 +704,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_f32:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
@@ -721,11 +720,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_rcp_f32:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
@@ -739,11 +738,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_f32:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
@@ -755,11 +754,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v1, -v1, v3, v4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_f32:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
@@ -773,11 +772,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_f32:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -794,11 +793,11 @@ define float @v_rcp_f32(float %x) {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_f32:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -817,6 +816,7 @@ define float @v_rcp_f32(float %x) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float 1.0, %x
   ret float %fdiv
@@ -825,7 +825,6 @@ define float @v_rcp_f32(float %x) {
 define float @v_rcp_f32_arcp(float %x) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_rcp_f32_arcp:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -837,11 +836,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_rcp_f32_arcp:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -855,11 +854,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_rcp_f32_arcp:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
@@ -871,11 +870,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_rcp_f32_arcp:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
@@ -889,11 +888,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_f32_arcp:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
@@ -905,11 +904,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX89-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_rcp_f32_arcp:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
@@ -923,11 +922,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_f32_arcp:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
@@ -939,11 +938,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX10-IEEE-NEXT:    v_fma_f32 v1, -v1, v3, v4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_f32_arcp:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v1
@@ -957,11 +956,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_f32_arcp:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -978,11 +977,11 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v3
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_f32_arcp:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -1001,6 +1000,7 @@ define float @v_rcp_f32_arcp(float %x) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float 1.0, %x
   ret float %fdiv
@@ -1009,20 +1009,20 @@ define float @v_rcp_f32_arcp(float %x) {
 define float @v_rcp_f32_arcp_afn(float %x) {
 ; GCN-LABEL: v_rcp_f32_arcp_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_f32_arcp_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_f32_arcp_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn float 1.0, %x
   ret float %fdiv
@@ -1031,7 +1031,6 @@ define float @v_rcp_f32_arcp_afn(float %x) {
 define float @v_rcp_f32_ulp25(float %x) {
 ; GFX6-IEEE-LABEL: v_rcp_f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -1040,33 +1039,33 @@ define float @v_rcp_f32_ulp25(float %x) {
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_rcp_f32_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_f32_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-IEEE-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_f32_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_f32_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1074,12 +1073,13 @@ define float @v_rcp_f32_ulp25(float %x) {
 ; GFX11-IEEE-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
 ; GFX11-IEEE-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_f32_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float 1.0, %x, !fpmath !0
   ret float %fdiv
@@ -1088,24 +1088,24 @@ define float @v_rcp_f32_ulp25(float %x) {
 define float @v_fdiv_f32_afn_ulp25(float %a, float %b) {
 ; GCN-LABEL: v_fdiv_f32_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float %a, %b, !fpmath !0
   ret float %fdiv
@@ -1114,7 +1114,6 @@ define float @v_fdiv_f32_afn_ulp25(float %a, float %b) {
 define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -1124,36 +1123,36 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-IEEE-NEXT:    v_sub_nc_u32_e32 v1, 0, v1
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1163,14 +1162,15 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float %a, %b, !fpmath !0
   ret float %fdiv
@@ -1179,7 +1179,6 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
 define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_v2f32:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
@@ -1202,11 +1201,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v5, v6, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_v2f32:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
@@ -1233,11 +1232,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v6
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_v2f32:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
@@ -1261,11 +1260,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_v2f32:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v2, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v6, v4
@@ -1292,11 +1291,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v5, v2, v4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_v2f32:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v3, v3, v1
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v0, v2, v0
@@ -1320,11 +1319,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v9, v11
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_v2f32:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v0, v2, v0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v6, v4
@@ -1351,11 +1350,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v5, v2, v4
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_v2f32:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v5, s4, v3, v3, v1
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v10, vcc_lo, v0, v2, v0
@@ -1379,11 +1378,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v5, v5, v7, v11
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, s4, v2, v2, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v6, vcc_lo, v0, v2, v0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v4
@@ -1410,11 +1409,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v5, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_v2f32:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, null, v2, v2, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v5, null, v3, v3, v1
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v10, vcc_lo, v0, v2, v0
@@ -1446,11 +1445,11 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v4, v2, v0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v1, v5, v3, v1
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_v2f32:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, null, v2, v2, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v6, vcc_lo, v0, v2, v0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -1487,6 +1486,7 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v5, v4
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v3, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b
   ret <2 x float> %fdiv
@@ -1495,29 +1495,29 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
 define <2 x float> @v_fdiv_v2f32_afn(<2 x float> %a, <2 x float> %b) {
 ; GCN-LABEL: v_fdiv_v2f32_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f32_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f32_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x float> %a, %b
   ret <2 x float> %fdiv
@@ -1526,7 +1526,6 @@ define <2 x float> @v_fdiv_v2f32_afn(<2 x float> %a, <2 x float> %b) {
 define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v5
@@ -1552,11 +1551,11 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x6f800000
 ; GCN-FLUSH-NEXT:    v_mov_b32_e32 v5, 0x2f800000
 ; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v2|, v4
@@ -1571,11 +1570,11 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v6, v0
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -1592,11 +1591,11 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v2|
 ; GFX10-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4
 ; GFX10-FLUSH-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v3|
@@ -1609,11 +1608,11 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v5, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v3
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -1632,11 +1631,11 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v0, v0, v2
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v1, v1, v3
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v2|
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s0
@@ -1650,6 +1649,7 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-FLUSH-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
 ; GFX11-FLUSH-NEXT:    v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -1658,7 +1658,6 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_rcp_v2f32:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -1681,11 +1680,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v3, v5, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_rcp_v2f32:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -1712,11 +1711,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_rcp_v2f32:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -1740,11 +1739,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_rcp_v2f32:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -1771,11 +1770,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_v2f32:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -1799,11 +1798,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_rcp_v2f32:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
@@ -1830,11 +1829,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_v2f32:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v1, v1, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
@@ -1858,11 +1857,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v9
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_v2f32:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
@@ -1889,11 +1888,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_v2f32:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v0, v0, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v3, null, v1, v1, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
@@ -1925,11 +1924,11 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_v2f32:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -1966,6 +1965,7 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
   ret <2 x float> %fdiv
@@ -1974,7 +1974,6 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
 define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-IEEE-FASTFMA-LABEL: v_rcp_v2f32_arcp:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -1997,11 +1996,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v3, v5, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_rcp_v2f32_arcp:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -2028,11 +2027,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_rcp_v2f32_arcp:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -2056,11 +2055,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_rcp_v2f32_arcp:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2087,11 +2086,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -2115,11 +2114,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v7, v9
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
 ; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2146,11 +2145,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_v2f32_arcp:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v1, v1, 1.0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
@@ -2174,11 +2173,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v9
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_v2f32_arcp:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
@@ -2205,11 +2204,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX10-FLUSH-NEXT:    s_denorm_mode 0
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_v2f32_arcp:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v0, v0, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v3, null, v1, v1, 1.0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0
@@ -2241,11 +2240,11 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_v2f32_arcp:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -2282,6 +2281,7 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
   ret <2 x float> %fdiv
@@ -2290,23 +2290,23 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
 define <2 x float> @v_rcp_v2f32_arcp_afn(<2 x float> %x) {
 ; GCN-LABEL: v_rcp_v2f32_arcp_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f32_arcp_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f32_arcp_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x float> <float 1.0, float 1.0>, %x
   ret <2 x float> %fdiv
@@ -2315,7 +2315,6 @@ define <2 x float> @v_rcp_v2f32_arcp_afn(<2 x float> %x) {
 define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
@@ -2331,18 +2330,18 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_rcp_v2f32_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_rcp_v2f32_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
@@ -2353,18 +2352,18 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX10-IEEE-NEXT:    v_sub_nc_u32_e32 v1, 0, v1
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v0, v2, v0
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_rcp_v2f32_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_rcp_v2f32_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
@@ -2378,13 +2377,14 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX11-IEEE-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v0, v2, v0
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v1, v3, v1
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_rcp_v2f32_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
   ret <2 x float> %fdiv
@@ -2393,29 +2393,29 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GCN-LABEL: v_fdiv_v2f32_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f32_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f32_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -2424,7 +2424,6 @@ define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) {
 define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v5
@@ -2442,20 +2441,20 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v3, vcc, 0, v3
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v2, v2, v3
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
-; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX10-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v3
 ; GFX10-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -2468,20 +2467,20 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX10-IEEE-NEXT:    v_ldexp_f32 v3, v5, v3
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
 ; GFX11-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v3
 ; GFX11-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -2497,15 +2496,16 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX11-IEEE-NEXT:    v_ldexp_f32 v3, v5, v3
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-FLUSH-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -2514,29 +2514,29 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 define <2 x float> @v_fdiv_v2f32_arcp_afn_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GCN-LABEL: v_fdiv_v2f32_arcp_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f32_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f32_arcp_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x float> %a, %b, !fpmath !0
   ret <2 x float> %fdiv
@@ -2610,7 +2610,6 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2625,11 +2624,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2644,11 +2643,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2663,11 +2662,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2682,11 +2681,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2701,11 +2700,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2720,11 +2719,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2739,11 +2738,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2758,11 +2757,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -2782,11 +2781,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -2806,6 +2805,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; EG:       ; %bb.0:
@@ -2862,7 +2862,6 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -2876,11 +2875,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -2894,11 +2893,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2907,11 +2906,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2923,6 +2922,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; EG:       ; %bb.0:
@@ -2982,7 +2982,6 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
@@ -2997,11 +2996,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-IEEE-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
@@ -3016,11 +3015,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3030,11 +3029,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3047,6 +3046,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; EG:       ; %bb.0:
@@ -3125,7 +3125,6 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3141,11 +3140,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3161,11 +3160,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v2, v1, v2
@@ -3181,11 +3180,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v2, v1, v2
@@ -3201,11 +3200,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v2, v1, v2
@@ -3221,11 +3220,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v2
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, v2, v1, v2
@@ -3241,11 +3240,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v2
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, vcc_lo, v2, v1, v2
@@ -3261,11 +3260,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v2
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, vcc_lo, v2, v1, v2
@@ -3281,11 +3280,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3307,11 +3306,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3333,6 +3332,7 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; EG:       ; %bb.0:
@@ -3390,7 +3390,6 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -3404,11 +3403,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -3422,11 +3421,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -3435,11 +3434,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -3451,6 +3450,7 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; EG:       ; %bb.0:
@@ -3529,7 +3529,6 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3545,11 +3544,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3565,11 +3564,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, v2, v0
@@ -3585,11 +3584,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, v2, v0
@@ -3605,11 +3604,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v0, v2, v0
@@ -3625,11 +3624,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, v0, v2, v0
@@ -3645,11 +3644,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, s4, v2, v2, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, vcc_lo, v0, v2, v0
@@ -3665,11 +3664,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, s4, v2, v2, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, vcc_lo, v0, v2, v0
@@ -3685,11 +3684,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3711,11 +3710,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -3737,6 +3736,7 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #0 {
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; EG:       ; %bb.0:
@@ -3819,7 +3819,6 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
@@ -3835,11 +3834,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
@@ -3855,11 +3854,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
@@ -3875,11 +3874,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
@@ -3895,11 +3894,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
@@ -3915,11 +3914,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
@@ -3935,11 +3934,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3954,11 +3953,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3973,11 +3972,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -3997,11 +3996,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4021,6 +4020,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; EG:       ; %bb.0:
@@ -4067,7 +4067,6 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, 0x3f40e400
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4078,11 +4077,11 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, 0x3f40e400
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4093,22 +4092,22 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, 0x3f40e400
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, 14, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, 0x3f40e400
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
@@ -4117,6 +4116,7 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; EG:       ; %bb.0:
@@ -4198,7 +4198,6 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
@@ -4214,11 +4213,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
@@ -4234,11 +4233,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
@@ -4254,11 +4253,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
@@ -4274,11 +4273,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
@@ -4294,11 +4293,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x4640e400
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
@@ -4314,11 +4313,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x4640e400
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4333,11 +4332,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x4640e400
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4352,11 +4351,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x4640e400
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4376,11 +4375,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x4640e400
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4400,6 +4399,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; EG:       ; %bb.0:
@@ -4449,7 +4449,6 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #0 {
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -4460,11 +4459,11 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #0 {
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -4475,22 +4474,22 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #0 {
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 14, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4500,6 +4499,7 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #0 {
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; EG:       ; %bb.0:
@@ -4577,7 +4577,6 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -4592,11 +4591,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -4611,11 +4610,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4630,11 +4629,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4649,11 +4648,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4668,11 +4667,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4687,11 +4686,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4706,11 +4705,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4725,11 +4724,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4749,11 +4748,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4773,6 +4772,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; EG:       ; %bb.0:
@@ -4829,7 +4829,6 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -4843,11 +4842,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -4861,11 +4860,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4874,11 +4873,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4890,6 +4889,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; EG:       ; %bb.0:
@@ -4966,7 +4966,6 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ;
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
-; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -4981,11 +4980,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
-; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -5000,11 +4999,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
-; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5019,11 +5018,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5038,11 +5037,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX89-IEEE:       ; %bb.0:
-; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5057,11 +5056,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX89-FLUSH:       ; %bb.0:
-; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5076,11 +5075,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX10-IEEE:       ; %bb.0:
-; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5095,11 +5094,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX10-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX10-FLUSH:       ; %bb.0:
-; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5114,11 +5113,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX10-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX11-IEEE:       ; %bb.0:
-; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-IEEE-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-IEEE-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -5138,11 +5137,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX11-IEEE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-FLUSH-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-FLUSH-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -5162,6 +5161,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; EG:       ; %bb.0:
@@ -5218,7 +5218,6 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ;
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX6-IEEE:       ; %bb.0:
-; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -5232,11 +5231,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX6-FLUSH:       ; %bb.0:
-; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
@@ -5250,11 +5249,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-FLUSH-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5263,11 +5262,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5279,6 +5278,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; EG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
index 78155453d908af1..f3223f79f67fd11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll
@@ -17,7 +17,6 @@
 define double @v_fdiv_f64(double %a, double %b) {
 ; GFX6-LABEL: v_fdiv_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -32,11 +31,11 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX6-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX8-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -48,11 +47,11 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX8-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX8-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -64,11 +63,11 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -80,11 +79,11 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -101,6 +100,7 @@ define double @v_fdiv_f64(double %a, double %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv double %a, %b
   ret double %fdiv
@@ -109,7 +109,6 @@ define double @v_fdiv_f64(double %a, double %b) {
 define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GCN-LABEL: v_fdiv_f64_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -118,11 +117,11 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GCN-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f64_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -131,11 +130,11 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f64_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -148,6 +147,7 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b
   ret double %fdiv
@@ -156,7 +156,6 @@ define double @v_fdiv_f64_afn(double %a, double %b) {
 define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX6-LABEL: v_fdiv_f64_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -171,11 +170,11 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX6-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f64_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX8-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -187,11 +186,11 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX8-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX8-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_f64_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -203,11 +202,11 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f64_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -219,11 +218,11 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f64_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -240,6 +239,7 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv double %a, %b, !fpmath !0
   ret double %fdiv
@@ -248,7 +248,6 @@ define double @v_fdiv_f64_ulp25(double %a, double %b) {
 define double @v_rcp_f64(double %x) {
 ; GFX6-LABEL: v_rcp_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -264,11 +263,11 @@ define double @v_rcp_f64(double %x) {
 ; GFX6-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX8-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -280,11 +279,11 @@ define double @v_rcp_f64(double %x) {
 ; GFX8-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX8-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -296,11 +295,11 @@ define double @v_rcp_f64(double %x) {
 ; GFX9-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX9-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -312,11 +311,11 @@ define double @v_rcp_f64(double %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -333,6 +332,7 @@ define double @v_rcp_f64(double %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv double 1.0, %x
   ret double %fdiv
@@ -341,7 +341,6 @@ define double @v_rcp_f64(double %x) {
 define double @v_rcp_f64_arcp(double %x) {
 ; GFX6-LABEL: v_rcp_f64_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -357,11 +356,11 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX6-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_f64_arcp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX8-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -373,11 +372,11 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX8-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX8-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_f64_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -389,11 +388,11 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX9-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX9-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_f64_arcp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -405,11 +404,11 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_f64_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -426,6 +425,7 @@ define double @v_rcp_f64_arcp(double %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp double 1.0, %x
   ret double %fdiv
@@ -434,7 +434,6 @@ define double @v_rcp_f64_arcp(double %x) {
 define double @v_rcp_f64_arcp_afn(double %x) {
 ; GCN-LABEL: v_rcp_f64_arcp_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
 ; GCN-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
@@ -442,11 +441,11 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_f64_arcp_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
@@ -454,11 +453,11 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_f64_arcp_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0
@@ -470,6 +469,7 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn double 1.0, %x
   ret double %fdiv
@@ -478,7 +478,6 @@ define double @v_rcp_f64_arcp_afn(double %x) {
 define double @v_rcp_f64_ulp25(double %x) {
 ; GFX6-LABEL: v_rcp_f64_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -494,11 +493,11 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX6-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_f64_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX8-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -510,11 +509,11 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX8-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX8-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_f64_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX9-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -526,11 +525,11 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX9-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; GFX9-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_f64_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -542,11 +541,11 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_f64_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -563,6 +562,7 @@ define double @v_rcp_f64_ulp25(double %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv double 1.0, %x, !fpmath !0
   ret double %fdiv
@@ -571,7 +571,6 @@ define double @v_rcp_f64_ulp25(double %x) {
 define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GCN-LABEL: v_fdiv_f64_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -580,11 +579,11 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GCN-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f64_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
@@ -593,11 +592,11 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GFX10-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f64_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
@@ -610,6 +609,7 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1]
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn double %a, %b, !fpmath !0
   ret double %fdiv
@@ -618,7 +618,6 @@ define double @v_fdiv_f64_afn_ulp25(double %a, double %b) {
 define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX6-LABEL: v_fdiv_f64_arcp_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -633,11 +632,11 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX6-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f64_arcp_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX8-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX8-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -649,11 +648,11 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX8-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX8-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_f64_arcp_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1]
 ; GFX9-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
 ; GFX9-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
@@ -665,11 +664,11 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX9-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; GFX9-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f64_arcp_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -681,11 +680,11 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX10-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; GFX10-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f64_arcp_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -702,6 +701,7 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp double %a, %b, !fpmath !0
   ret double %fdiv
@@ -710,7 +710,6 @@ define double @v_fdiv_f64_arcp_ulp25(double %a, double %b) {
 define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX6-LABEL: v_fdiv_v2f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
@@ -740,11 +739,11 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -768,11 +767,11 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -796,11 +795,11 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -824,11 +823,11 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3]
 ; GFX11-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -861,6 +860,7 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -869,7 +869,6 @@ define <2 x double> @v_fdiv_v2f64(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GCN-LABEL: v_fdiv_v2f64_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -886,11 +885,11 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -907,11 +906,11 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -935,6 +934,7 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b
   ret <2 x double> %fdiv
@@ -943,7 +943,6 @@ define <2 x double> @v_fdiv_v2f64_afn(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-LABEL: v_fdiv_v2f64_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
@@ -973,11 +972,11 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f64_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -1001,11 +1000,11 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f64_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -1029,11 +1028,11 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -1057,11 +1056,11 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3]
 ; GFX11-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -1094,6 +1093,7 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1102,7 +1102,6 @@ define <2 x double> @v_fdiv_v2f64_ulp25(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6-LABEL: v_rcp_v2f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -1133,11 +1132,11 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1161,11 +1160,11 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1189,11 +1188,11 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1217,11 +1216,11 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1254,6 +1253,7 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1262,7 +1262,6 @@ define <2 x double> @v_rcp_v2f64(<2 x double> %x) {
 define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6-LABEL: v_rcp_v2f64_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -1293,11 +1292,11 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f64_arcp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1321,11 +1320,11 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_v2f64_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1349,11 +1348,11 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f64_arcp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1377,11 +1376,11 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f64_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1414,6 +1413,7 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1422,7 +1422,6 @@ define <2 x double> @v_rcp_v2f64_arcp(<2 x double> %x) {
 define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GCN-LABEL: v_rcp_v2f64_arcp_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; GCN-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
@@ -1437,11 +1436,11 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f64_arcp_afn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0
@@ -1456,11 +1455,11 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f64_arcp_afn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[4:5], v[0:1]
 ; GFX11-NEXT:    v_rcp_f64_e32 v[6:7], v[2:3]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -1481,6 +1480,7 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn <2 x double> <double 1.0, double 1.0>, %x
   ret <2 x double> %fdiv
@@ -1489,7 +1489,6 @@ define <2 x double> @v_rcp_v2f64_arcp_afn(<2 x double> %x) {
 define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6-LABEL: v_rcp_v2f64_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX6-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
@@ -1520,11 +1519,11 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rcp_v2f64_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX8-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1548,11 +1547,11 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rcp_v2f64_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0
@@ -1576,11 +1575,11 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f64_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0
 ; GFX10-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1604,11 +1603,11 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f64_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0
@@ -1641,6 +1640,7 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x double> <double 1.0, double 1.0>, %x, !fpmath !0
   ret <2 x double> %fdiv
@@ -1649,7 +1649,6 @@ define <2 x double> @v_rcp_v2f64_ulp25(<2 x double> %x) {
 define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GCN-LABEL: v_fdiv_v2f64_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -1666,11 +1665,11 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -1687,11 +1686,11 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -1715,6 +1714,7 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1723,7 +1723,6 @@ define <2 x double> @v_fdiv_v2f64_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-LABEL: v_fdiv_v2f64_arcp_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_div_scale_f64 v[16:17], s[4:5], v[0:1], v[4:5], v[0:1]
 ; GFX6-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
@@ -1753,11 +1752,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX6-NEXT:    s_nop 0
 ; GFX6-NEXT:    v_div_fmas_f64 v[8:9], v[14:15], v[8:9], v[16:17]
 ; GFX6-NEXT:    v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_v2f64_arcp_ulp25:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX8-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -1781,11 +1780,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX8-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX8-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX8-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_v2f64_arcp_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3]
 ; GFX9-NEXT:    v_div_scale_f64 v[20:21], s[4:5], v[2:3], v[6:7], v[2:3]
@@ -1809,11 +1808,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX9-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX9-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX9-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64_arcp_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3]
 ; GFX10-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -1837,11 +1836,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX10-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23]
 ; GFX10-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX10-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64_arcp_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3]
 ; GFX11-NEXT:    v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1]
@@ -1874,6 +1873,7 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1]
 ; GFX11-NEXT:    v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
@@ -1882,7 +1882,6 @@ define <2 x double> @v_fdiv_v2f64_arcp_ulp25(<2 x double> %a, <2 x double> %b) {
 define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %b) {
 ; GCN-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GCN-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GCN-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -1899,11 +1898,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX10-NEXT:    v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0
@@ -1920,11 +1919,11 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3]
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f64_arcp_afn_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[4:5]
 ; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -1948,6 +1947,7 @@ define <2 x double> @v_fdiv_v2f64_arcp_afn_ulp25(<2 x double> %a, <2 x double> %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13]
 ; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0
   ret <2 x double> %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..b4e44c60ad1be4d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -164,7 +164,6 @@ bb:
 define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, s32, v1
@@ -174,12 +173,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s32, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
@@ -189,12 +187,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX940-NEXT:    v_add_u32_e32 v1, s32, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
@@ -203,12 +200,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -217,7 +213,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s32 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -232,35 +228,33 @@ bb:
 define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    scratch_store_dword v0, v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -449,7 +443,6 @@ bb:
 define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -463,12 +456,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_small_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
@@ -482,12 +474,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_small_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -499,12 +490,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x100
@@ -515,7 +505,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -716,7 +706,6 @@ bb:
 define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -730,12 +719,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_large_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -749,12 +737,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_large_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
@@ -767,12 +754,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -784,7 +770,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, s0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -873,7 +859,6 @@ bb:
 define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX9-NEXT:    s_add_i32 s1, s32, 4
@@ -884,12 +869,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_large_imm_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
@@ -900,12 +884,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT:    scratch_store_dword off, v1, s0
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_large_imm_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
@@ -914,12 +897,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
@@ -927,7 +909,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -1019,45 +1001,41 @@ bb:
 define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_aligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i64_aligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i64_aligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i64_aligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -1068,45 +1046,41 @@ bb:
 define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i64_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i64_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
 ; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i64_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -1117,7 +1091,6 @@ bb:
 define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v3i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s2, 3
 ; GFX9-NEXT:    s_mov_b32 s1, 2
 ; GFX9-NEXT:    s_mov_b32 s0, 1
@@ -1127,12 +1100,11 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_v3i32_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s2, 3
 ; GFX10-NEXT:    s_mov_b32 s1, 2
 ; GFX10-NEXT:    s_mov_b32 s0, 1
@@ -1142,12 +1114,11 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_v3i32_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b32 s2, 3
 ; GFX940-NEXT:    s_mov_b32 s1, 2
 ; GFX940-NEXT:    s_mov_b32 s0, 1
@@ -1157,12 +1128,11 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_v3i32_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, 3
 ; GFX11-NEXT:    s_mov_b32 s1, 2
 ; GFX11-NEXT:    s_mov_b32 s0, 1
@@ -1171,7 +1141,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -1182,7 +1152,6 @@ bb:
 define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v4i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 4
 ; GFX9-NEXT:    s_mov_b32 s2, 3
 ; GFX9-NEXT:    s_mov_b32 s1, 2
@@ -1194,12 +1163,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_v4i32_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s3, 4
 ; GFX10-NEXT:    s_mov_b32 s2, 3
 ; GFX10-NEXT:    s_mov_b32 s1, 2
@@ -1211,12 +1179,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_v4i32_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b32 s3, 4
 ; GFX940-NEXT:    s_mov_b32 s2, 3
 ; GFX940-NEXT:    s_mov_b32 s1, 2
@@ -1226,12 +1193,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_v4i32_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 4
 ; GFX11-NEXT:    s_mov_b32 s2, 3
 ; GFX11-NEXT:    s_mov_b32 s1, 2
@@ -1241,7 +1207,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
index 34635b077cd92d0..11f07399f14c19e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll
@@ -6,7 +6,6 @@
 define double @v_floor_f64_ieee(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -15,12 +14,13 @@ define double @v_floor_f64_ieee(double %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_ieee:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.floor.f64(double %x)
   ret double %result
@@ -29,18 +29,18 @@ define double @v_floor_f64_ieee(double %x) {
 define double @v_floor_f64_ieee_nnan(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee_nnan:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_ieee_nnan:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan double @llvm.floor.f64(double %x)
   ret double %result
@@ -49,7 +49,6 @@ define double @v_floor_f64_ieee_nnan(double %x) {
 define double @v_floor_f64_ieee_fneg(double %x) {
 ; GFX6-LABEL: v_floor_f64_ieee_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -58,12 +57,13 @@ define double @v_floor_f64_ieee_fneg(double %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_ieee_fneg:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %result = call double @llvm.floor.f64(double %neg.x)
@@ -73,7 +73,6 @@ define double @v_floor_f64_ieee_fneg(double %x) {
 define double @v_floor_f64_nonieee(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_nonieee:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -82,12 +81,13 @@ define double @v_floor_f64_nonieee(double %x) #1 {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_nonieee:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.floor.f64(double %x)
   ret double %result
@@ -96,18 +96,18 @@ define double @v_floor_f64_nonieee(double %x) #1 {
 define double @v_floor_f64_nonieee_nnan(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_nonieee_nnan:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_min_f64 v[2:3], v[2:3], s[4:5]
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_nonieee_nnan:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan double @llvm.floor.f64(double %x)
   ret double %result
@@ -116,7 +116,6 @@ define double @v_floor_f64_nonieee_nnan(double %x) #1 {
 define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
 ; GFX6-LABEL: v_floor_f64_non_ieee_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -v[0:1]
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -125,12 +124,13 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_non_ieee_fneg:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -v[0:1]
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %result = call double @llvm.floor.f64(double %neg.x)
@@ -140,7 +140,6 @@ define double @v_floor_f64_non_ieee_fneg(double %x) #1 {
 define double @v_floor_f64_fabs(double %x) {
 ; GFX6-LABEL: v_floor_f64_fabs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], |v[0:1]|
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -149,12 +148,13 @@ define double @v_floor_f64_fabs(double %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], |v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_fabs:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e64 v[0:1], |v[0:1]|
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %abs.x = call double @llvm.fabs.f64(double %x)
   %result = call double @llvm.floor.f64(double %abs.x)
@@ -164,7 +164,6 @@ define double @v_floor_f64_fabs(double %x) {
 define double @v_floor_f64_fneg_fabs(double %x) {
 ; GFX6-LABEL: v_floor_f64_fneg_fabs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e64 v[2:3], -|v[0:1]|
 ; GFX6-NEXT:    s_mov_b32 s4, -1
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
@@ -173,12 +172,13 @@ define double @v_floor_f64_fneg_fabs(double %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v1, vcc
 ; GFX6-NEXT:    v_add_f64 v[0:1], -|v[0:1]|, -v[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX78-LABEL: v_floor_f64_fneg_fabs:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_floor_f64_e64 v[0:1], -|v[0:1]|
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
   %abs.x = call double @llvm.fabs.f64(double %x)
   %neg.abs.x = fneg double %abs.x
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index 150c85f467d1a60..5665b6b55109e89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -8,32 +8,32 @@
 define float @v_fma_f32(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
   ret float %fma
@@ -42,37 +42,37 @@ define float @v_fma_f32(float %x, float %y, float %z) {
 define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
 ; GFX6-LABEL: v_fma_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX9-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX11-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z)
   ret <2 x float> %fma
@@ -81,36 +81,36 @@ define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z)
 define half @v_fma_f16(half %x, half %y, half %z) {
 ; GFX6-LABEL: v_fma_f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %fma
@@ -119,36 +119,36 @@ define half @v_fma_f16(half %x, half %y, half %z) {
 define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
 ; GFX6-LABEL: v_fma_f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f16_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, -v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg half %x
   %fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z)
@@ -158,36 +158,36 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) {
 define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
 ; GFX6-LABEL: v_fma_f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f16_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.y = fneg half %y
   %fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z)
@@ -197,36 +197,36 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) {
 define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
 ; GFX6-LABEL: v_fma_f16_fneg_add:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, -v2
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f16_fneg_add:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f16_fneg_add:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f16_fneg_add:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f16_fneg_add:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg half %z
   %fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z)
@@ -236,7 +236,6 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) {
 define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -247,11 +246,11 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -259,24 +258,25 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z)
   ret <2 x half> %fma
@@ -285,7 +285,6 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -301,11 +300,11 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -314,24 +313,25 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f16_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z)
@@ -341,7 +341,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
 define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
@@ -357,11 +356,11 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -370,24 +369,25 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f16_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z)
@@ -397,7 +397,6 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
 define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -408,11 +407,11 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -420,24 +419,25 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f16_fneg_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y
@@ -448,7 +448,6 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
 define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
 ; GFX6-LABEL: v_fma_v3f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v6, v6
@@ -464,11 +463,11 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
 ; GFX6-NEXT:    v_fma_f32 v2, v2, v5, v6
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v3f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
@@ -477,27 +476,28 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v3f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v3f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z)
   ret <3 x half> %fma
@@ -506,7 +506,6 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
 define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX6-LABEL: v_fma_v4f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v8, v8
@@ -527,11 +526,11 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX6-NEXT:    v_fma_f32 v3, v3, v6, v7
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
@@ -546,27 +545,28 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v4f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z)
   ret <4 x half> %fma
@@ -575,32 +575,32 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
 define double @v_fma_f64(double %x, double %y, double %z) {
 ; GFX6-LABEL: v_fma_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call double @llvm.fma.f64(double %x, double %y, double %z)
   ret double %fma
@@ -609,32 +609,32 @@ define double @v_fma_f64(double %x, double %y, double %z) {
 define double @v_fma_f64_fneg_all(double %x, double %y, double %z) {
 ; GFX6-LABEL: v_fma_f64_fneg_all:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f64_fneg_all:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f64_fneg_all:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f64_fneg_all:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f64_fneg_all:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %neg.y = fneg double %y
@@ -646,37 +646,37 @@ define double @v_fma_f64_fneg_all(double %x, double %y, double %z) {
 define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
 ; GFX6-LABEL: v_fma_v2f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX6-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_v2f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX8-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_v2f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z)
   ret <2 x double> %fma
@@ -685,32 +685,32 @@ define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double>
 define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fabs_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fabs_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fabs_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fabs_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
@@ -720,32 +720,32 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
 define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, v0, |v1|, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fabs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, |v1|, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fabs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, v0, |v1|, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fabs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, |v1|, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fabs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, v0, |v1|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z)
@@ -755,32 +755,32 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
 define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fabs_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
@@ -886,32 +886,32 @@ define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y,
 define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
@@ -921,32 +921,32 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
 define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.y = fneg float %y
   %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z)
@@ -956,32 +956,32 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
 define float @v_fma_f32_fneg_z(float %x, float %y, float %z) {
 ; GFX6-LABEL: v_fma_f32_fneg_z:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fma_f32_fneg_z:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fma_f32_fneg_z:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_f32_fneg_z:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_f32_fneg_z:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg float %z
   %fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
index 3d5f908c8667ff2..0352c8f31e2633e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll
@@ -6,15 +6,15 @@
 define float @v_test_fmax_legacy_ogt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ogt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ogt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ogt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -24,15 +24,15 @@ define float @v_test_fmax_legacy_ogt_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_oge_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_oge_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_oge_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp oge float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -42,15 +42,15 @@ define float @v_test_fmax_legacy_oge_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_uge_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_uge_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_uge_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -60,15 +60,15 @@ define float @v_test_fmax_legacy_uge_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_ugt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ugt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ugt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -78,15 +78,15 @@ define float @v_test_fmax_legacy_ugt_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_ole_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ole_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ole_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ole float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -96,15 +96,15 @@ define float @v_test_fmax_legacy_ole_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_olt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_olt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_olt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp olt float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -114,15 +114,15 @@ define float @v_test_fmax_legacy_olt_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_ule_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ule_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ule_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -132,15 +132,15 @@ define float @v_test_fmax_legacy_ule_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_ult_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ult_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ult_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ult float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -150,15 +150,15 @@ define float @v_test_fmax_legacy_ult_f32(float %a, float %b) {
 define float @v_test_fmax_legacy_oge_f32_fneg_lhs(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_oge_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e64 v0, -v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_oge_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], -v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %a.neg = fneg float %a
   %cmp = fcmp oge float %a.neg, %b
@@ -169,15 +169,15 @@ define float @v_test_fmax_legacy_oge_f32_fneg_lhs(float %a, float %b) {
 define float @v_test_fmax_legacy_oge_f32_fneg_rhs(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_oge_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e64 v0, v0, -v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_oge_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ge_f32_e64 s[4:5], v0, -v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %b.neg = fneg float %b
   %cmp = fcmp oge float %a, %b.neg
@@ -188,16 +188,16 @@ define float @v_test_fmax_legacy_oge_f32_fneg_rhs(float %a, float %b) {
 define float @v_test_fcmp_select_ord(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_ord:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_ord:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ord float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -207,24 +207,22 @@ define float @v_test_fcmp_select_ord(float %a, float %b) {
 define float @v_test_fmax_legacy_ule_f32_multi_use(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ule_f32_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    ds_write_b32 v0, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ule_f32_multi_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write_b32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ogt float %a, %b
   %val0 = select i1 %cmp, float %a, float %b
@@ -236,18 +234,18 @@ define float @v_test_fmax_legacy_ule_f32_multi_use(float %a, float %b) {
 define double @v_test_fmax_legacy_ult_f64(double %a, double %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ult_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ult_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ult double %a, %b
   %val = select i1 %cmp, double %b, double %a
@@ -257,18 +255,18 @@ define double @v_test_fmax_legacy_ult_f64(double %a, double %b) {
 define <2 x float> @v_test_fmax_legacy_ogt_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-LABEL: v_test_fmax_legacy_ogt_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_max_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmax_legacy_ogt_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ogt <2 x float> %a, %b
   %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
index d6d36fe1acf35f7..2933127f500cc94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3-min-max-const-combine.ll
@@ -5,14 +5,14 @@
 define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 ; GFX10-LABEL: test_min_max_ValK0_K1_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_ValK0_K1_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
   %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -22,14 +22,14 @@ define float @test_min_max_ValK0_K1_f32(float %a) #0 {
 define float @test_min_max_K0Val_K1_f32(float %a) #1 {
 ; GFX10-LABEL: test_min_max_K0Val_K1_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_K0Val_K1_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float 2.0, float %a)
   %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -41,17 +41,17 @@ define float @test_min_max_K0Val_K1_f32(float %a) #1 {
 define half @test_min_K1max_ValK0_f16(half %a) #0 {
 ; GFX10-LABEL: test_min_K1max_ValK0_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX10-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_K1max_ValK0_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX8-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call half @llvm.maxnum.f16(half %a, half 2.0)
   %fmed = call half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -61,15 +61,15 @@ define half @test_min_K1max_ValK0_f16(half %a) #0 {
 define half @test_min_K1max_K0Val_f16(half %a) #1 {
 ; GFX10-LABEL: test_min_K1max_K0Val_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_K1max_K0Val_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan half @llvm.maxnum.f16(half 2.0, half %a)
   %fmed = call nnan half @llvm.minnum.f16(half 4.0, half %maxnum)
@@ -80,14 +80,14 @@ define half @test_min_K1max_K0Val_f16(half %a) #1 {
 define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 ; GFX10-LABEL: test_max_min_ValK1_K0_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_ValK1_K0_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 4.0)
   %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -97,14 +97,14 @@ define float @test_max_min_ValK1_K0_f32(float %a) #0 {
 define float @test_max_min_K1Val_K0_f32(float %a) #1 {
 ; GFX10-LABEL: test_max_min_K1Val_K0_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_K1Val_K0_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float 4.0, float %a)
   %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -114,15 +114,15 @@ define float @test_max_min_K1Val_K0_f32(float %a) #1 {
 define half @test_max_K0min_ValK1_f16(half %a) #0 {
 ; GFX10-LABEL: test_max_K0min_ValK1_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_K0min_ValK1_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f16_e32 v0, 4.0, v0
 ; GFX8-NEXT:    v_max_f16_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan half @llvm.minnum.f16(half %a, half 4.0)
   %fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -132,15 +132,15 @@ define half @test_max_K0min_ValK1_f16(half %a) #0 {
 define half @test_max_K0min_K1Val_f16(half %a) #1 {
 ; GFX10-LABEL: test_max_K0min_K1Val_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f16 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_K0min_K1Val_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f16_e32 v0, 4.0, v0
 ; GFX8-NEXT:    v_max_f16_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan half @llvm.minnum.f16(half 4.0, half %a)
   %fmed = call nnan half @llvm.maxnum.f16(half 2.0, half %minnum)
@@ -152,14 +152,14 @@ define half @test_max_K0min_K1Val_f16(half %a) #1 {
 define float @test_min_max_global_nnan(float %a) #2 {
 ; GFX10-LABEL: test_min_max_global_nnan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_global_nnan:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
   %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -169,14 +169,14 @@ define float @test_min_max_global_nnan(float %a) #2 {
 define float @test_max_min_global_nnan(float %a) #2 {
 ; GFX10-LABEL: test_max_min_global_nnan:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_global_nnan:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
   %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -191,16 +191,16 @@ define float @test_max_min_global_nnan(float %a) #2 {
 define float @test_min_max_K0_gt_K1(float %a) #0 {
 ; GFX10-LABEL: test_min_max_K0_gt_K1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 2.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_K0_gt_K1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 4.0)
   %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 2.0)
@@ -211,16 +211,16 @@ define float @test_min_max_K0_gt_K1(float %a) #0 {
 define float @test_max_min_K0_gt_K1(float %a) #0 {
 ; GFX10-LABEL: test_max_min_K0_gt_K1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_K0_gt_K1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f32_e32 v0, 2.0, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call nnan float @llvm.minnum.f32(float %a, float 2.0)
   %fmed = call nnan float @llvm.maxnum.f32(float %minnum, float 4.0)
@@ -231,16 +231,16 @@ define float @test_max_min_K0_gt_K1(float %a) #0 {
 define float @test_min_max_non_inline_const(float %a) #0 {
 ; GFX10-LABEL: test_min_max_non_inline_const:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_non_inline_const:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f32_e32 v0, 2.0, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan float @llvm.maxnum.f32(float %a, float 2.0)
   %fmed = call nnan float @llvm.minnum.f32(float %maxnum, float 8.0)
@@ -252,16 +252,16 @@ define float @test_min_max_non_inline_const(float %a) #0 {
 define double @test_min_max_f64(double %a) #0 {
 ; GFX10-LABEL: test_min_max_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], 2.0
 ; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], 2.0
 ; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], 4.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan double @llvm.maxnum.f64(double %a, double 2.0)
   %fmed = call nnan double @llvm.minnum.f64(double %maxnum, double 4.0)
@@ -271,14 +271,13 @@ define double @test_min_max_f64(double %a) #0 {
 define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
 ; GFX10-LABEL: test_min_max_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v0, v0, 2.0 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_min_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; GFX8-NEXT:    v_max_f16_e32 v1, 2.0, v0
 ; GFX8-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -286,6 +285,7 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
 ; GFX8-NEXT:    v_min_f16_e32 v1, 4.0, v1
 ; GFX8-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
   %fmed = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> %maxnum, <2 x half> <half 4.0, half 4.0>)
@@ -298,16 +298,16 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
 define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 ; GFX10-LABEL: test_min_max_maybe_NaN_input_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, 2.0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_min_max_maybe_NaN_input_ieee_false:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f32_e32 v0, 2.0, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %maxnum = call float @llvm.maxnum.f32(float %a, float 2.0)
   %fmed = call float @llvm.minnum.f32(float %maxnum, float 4.0)
@@ -319,16 +319,16 @@ define float @test_min_max_maybe_NaN_input_ieee_false(float %a) #1 {
 define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
 ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_false:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 2.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_false:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
   %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
@@ -339,18 +339,18 @@ define float @test_max_min_maybe_NaN_input_ieee_false(float %a) #1 {
 define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 ; GFX10-LABEL: test_max_min_maybe_NaN_input_ieee_true:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 2.0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_max_min_maybe_NaN_input_ieee_true:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; GFX8-NEXT:    v_max_f32_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %minnum = call float @llvm.minnum.f32(float %a, float 4.0)
   %fmed = call float @llvm.maxnum.f32(float %minnum, float 2.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
index 10e6b3f1b47b1de..c5d0a612d4797cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll
@@ -8,15 +8,15 @@
 define float @v_test_fmin_legacy_ole_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ole_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ole_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ole float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -26,15 +26,15 @@ define float @v_test_fmin_legacy_ole_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_olt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_olt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_olt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp olt float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -44,15 +44,15 @@ define float @v_test_fmin_legacy_olt_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_ule_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ule_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ule_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -62,15 +62,15 @@ define float @v_test_fmin_legacy_ule_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_ult_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ult_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ult_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ult float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -80,15 +80,15 @@ define float @v_test_fmin_legacy_ult_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_ogt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ogt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ogt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ogt float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -98,15 +98,15 @@ define float @v_test_fmin_legacy_ogt_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_oge_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_oge_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_oge_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp oge float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -116,15 +116,15 @@ define float @v_test_fmin_legacy_oge_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_uge_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_uge_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_uge_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -134,15 +134,15 @@ define float @v_test_fmin_legacy_uge_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_ugt_f32(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ugt_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ugt_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nle_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt float %a, %b
   %val = select i1 %cmp, float %b, float %a
@@ -152,15 +152,15 @@ define float @v_test_fmin_legacy_ugt_f32(float %a, float %b) {
 define float @v_test_fmin_legacy_ole_f32_fneg_lhs(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ole_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e64 v0, -v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ole_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f32_e64 s[4:5], -v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %a.neg = fneg float %a
   %cmp = fcmp ole float %a.neg, %b
@@ -171,15 +171,15 @@ define float @v_test_fmin_legacy_ole_f32_fneg_lhs(float %a, float %b) {
 define float @v_test_fmin_legacy_ole_f32_fneg_rhs(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ole_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e64 v0, v0, -v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ole_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f32_e64 s[4:5], v0, -v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %b.neg = fneg float %b
   %cmp = fcmp ole float %a, %b.neg
@@ -190,24 +190,22 @@ define float @v_test_fmin_legacy_ole_f32_fneg_rhs(float %a, float %b) {
 define float @v_test_fmin_legacy_ule_f32_multi_use(float %a, float %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ule_f32_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    ds_write_b32 v0, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ule_f32_multi_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ngt_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write_b32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule float %a, %b
   %val0 = select i1 %cmp, float %a, float %b
@@ -219,18 +217,18 @@ define float @v_test_fmin_legacy_ule_f32_multi_use(float %a, float %b) {
 define double @v_test_fmin_legacy_ole_f64(double %a, double %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ole_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ole_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ole double %a, %b
   %val = select i1 %cmp, double %a, double %b
@@ -240,16 +238,16 @@ define double @v_test_fmin_legacy_ole_f64(double %a, double %b) {
 define float @v_test_fcmp_select_oeq(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_oeq:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_oeq:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp oeq float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -259,16 +257,16 @@ define float @v_test_fcmp_select_oeq(float %a, float %b) {
 define float @v_test_fcmp_select_one(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_one:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_one:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lg_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp one float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -278,16 +276,16 @@ define float @v_test_fcmp_select_one(float %a, float %b) {
 define float @v_test_fcmp_select_ord(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_ord:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_ord:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ord float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -297,16 +295,16 @@ define float @v_test_fcmp_select_ord(float %a, float %b) {
 define float @v_test_fcmp_select_uno(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_uno:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_uno:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uno float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -316,16 +314,16 @@ define float @v_test_fcmp_select_uno(float %a, float %b) {
 define float @v_test_fcmp_select_ueq(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_ueq:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_ueq:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_nlg_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ueq float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -335,16 +333,16 @@ define float @v_test_fcmp_select_ueq(float %a, float %b) {
 define float @v_test_fcmp_select_une(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_une:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_une:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_neq_f32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp une float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -369,14 +367,14 @@ define float @v_test_fcmp_select_true(float %a, float %b) {
 define float @v_test_fcmp_select_false(float %a, float %b) {
 ; GFX6-LABEL: v_test_fcmp_select_false:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fcmp_select_false:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp false float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -386,18 +384,18 @@ define float @v_test_fcmp_select_false(float %a, float %b) {
 define <2 x float> @v_test_fmin_legacy_ole_v2f32(<2 x float> %a, <2 x float> %b) {
 ; GFX6-LABEL: v_test_fmin_legacy_ole_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_min_legacy_f32_e32 v1, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_test_fmin_legacy_ole_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_le_f32_e32 vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ole <2 x float> %a, %b
   %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
index 543f8e413abd860..9b84b3bef1f97c4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll
@@ -7,22 +7,22 @@
 define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fmul_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x half> %a, %b
   ret <2 x half> %mul
@@ -31,23 +31,23 @@ define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fmul_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v2f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %mul = fmul <2 x half> %neg.a, %b
@@ -57,23 +57,23 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fmul_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v2f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v2f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %mul = fmul <2 x half> %a, %neg.b
@@ -83,22 +83,22 @@ define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %neg.b = fneg <2 x half> %b
@@ -109,25 +109,25 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
 define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
 ; GFX9-LABEL: v_fmul_v3f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v3f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <3 x half> %a, %b
   ret <3 x half> %mul
@@ -136,27 +136,27 @@ define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
 define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
 ; GFX9-LABEL: v_fmul_v3f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v3f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <3 x half> %a
   %mul = fmul <3 x half> %neg.a, %b
@@ -166,27 +166,27 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
 define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
 ; GFX9-LABEL: v_fmul_v3f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v3f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <3 x half> %b
   %mul = fmul <3 x half> %a, %neg.b
@@ -196,25 +196,25 @@ define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
 define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b) {
 ; GFX9-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <3 x half> %a
   %neg.b = fneg <3 x half> %b
@@ -225,27 +225,27 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b)
 define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
 ; GFX9-LABEL: v_fmul_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <4 x half> %a, %b
   ret <4 x half> %mul
@@ -254,14 +254,13 @@ define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
 define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
 ; GFX9-LABEL: v_fmul_v4f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
@@ -270,13 +269,14 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
 ; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <4 x half> %a
   %mul = fmul <4 x half> %neg.a, %b
@@ -286,14 +286,13 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
 define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
 ; GFX9-LABEL: v_fmul_v4f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v4f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
@@ -302,13 +301,14 @@ define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
 ; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <4 x half> %b
   %mul = fmul <4 x half> %a, %neg.b
@@ -318,27 +318,27 @@ define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
 define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b) {
 ; GFX9-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v2, v1, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <4 x half> %a
   %neg.b = fneg <4 x half> %b
@@ -349,15 +349,14 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b)
 define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
 ; GFX9-LABEL: v_fmul_v6f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v6f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
@@ -367,14 +366,15 @@ define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v6f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <6 x half> %a, %b
   ret <6 x half> %mul
@@ -383,15 +383,14 @@ define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
 define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
 ; GFX9-LABEL: v_fmul_v6f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
@@ -404,14 +403,15 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <6 x half> %a
   %mul = fmul <6 x half> %neg.a, %b
@@ -421,15 +421,14 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
 define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
 ; GFX9-LABEL: v_fmul_v6f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v6f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
@@ -442,14 +441,15 @@ define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v6f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <6 x half> %b
   %mul = fmul <6 x half> %a, %neg.b
@@ -459,15 +459,14 @@ define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
 define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b) {
 ; GFX9-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v6, v0, v3
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v3, v1, v4
@@ -477,14 +476,15 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v4
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <6 x half> %a
   %neg.b = fneg <6 x half> %b
@@ -495,16 +495,15 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b)
 define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
 ; GFX9-LABEL: v_fmul_v8f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v8f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
@@ -517,15 +516,16 @@ define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6
 ; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <8 x half> %a, %b
   ret <8 x half> %mul
@@ -534,16 +534,15 @@ define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
 define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
 ; GFX9-LABEL: v_fmul_v8f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
@@ -560,15 +559,16 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[1,0] neg_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <8 x half> %a
   %mul = fmul <8 x half> %neg.a, %b
@@ -578,16 +578,15 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
 define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
 ; GFX9-LABEL: v_fmul_v8f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v8f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80008000, v5
 ; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80008000, v6
@@ -604,15 +603,16 @@ define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <8 x half> %b
   %mul = fmul <8 x half> %a, %neg.b
@@ -622,16 +622,15 @@ define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
 define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b) {
 ; GFX9-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, v6
 ; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f16_e32 v8, v0, v4
 ; GFX8-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_f16_e32 v4, v1, v5
@@ -644,15 +643,16 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v4
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX10-NEXT:    v_pk_mul_f16 v2, v2, v6
 ; GFX10-NEXT:    v_pk_mul_f16 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <8 x half> %a
   %neg.b = fneg <8 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index dd5b07ee28bb64b..9f8ebf2750087ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -61,9 +61,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
 define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret float %ret
@@ -72,13 +71,13 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret float %ret
@@ -101,9 +100,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
 define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
@@ -126,9 +124,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
 define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
@@ -151,9 +148,8 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
 define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
@@ -175,9 +171,8 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    ds_pk_add_rtn_f16 v0, v0, v1
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret <2 x half> %ret
@@ -203,12 +198,12 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    ds_pk_add_rtn_bf16 v0, v0, v1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index dfa77337363e988..00535b788f799ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1234,16 +1234,14 @@ main_body:
 define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1253,7 +1251,6 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB44_1: ; %atomicrmw.start
@@ -1269,6 +1266,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1279,13 +1277,13 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1295,24 +1293,24 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1322,7 +1320,6 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB46_1: ; %atomicrmw.start
@@ -1338,23 +1335,25 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1364,16 +1363,14 @@ main_body:
 define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1383,16 +1380,14 @@ main_body:
 define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1561,7 +1556,6 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB53_1: ; %atomicrmw.start
@@ -1577,6 +1571,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB53_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1587,13 +1582,13 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
@@ -1603,24 +1598,24 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1630,7 +1625,6 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB55_1: ; %atomicrmw.start
@@ -1647,6 +1641,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB55_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1657,14 +1652,13 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1697,16 +1691,14 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
@@ -1780,16 +1772,14 @@ main_body:
 define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
@@ -1822,16 +1812,14 @@ main_body:
 define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
@@ -1866,20 +1854,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -1994,21 +1980,19 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data) #1 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2018,20 +2002,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2041,20 +2023,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v5, v2
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[4:5]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 1c40f7992bfe14c..b0f3f9203ba5042 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -8,7 +8,6 @@
 define float @v_pow_f32(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -28,11 +27,11 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -52,11 +51,11 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -76,11 +75,11 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
@@ -94,11 +93,11 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -117,6 +116,7 @@ define float @v_pow_f32(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -125,7 +125,6 @@ define float @v_pow_f32(float %x, float %y) {
 define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6-LABEL: v_pow_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
@@ -158,11 +157,11 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
@@ -195,11 +194,11 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
@@ -232,11 +231,11 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
@@ -263,11 +262,11 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 1.0, 0x4f800000, vcc_lo
@@ -297,6 +296,7 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
   ret <2 x float> %pow
@@ -305,7 +305,6 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 define half @v_pow_f16(half %x, half %y) {
 ; GFX6-LABEL: v_pow_f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
@@ -320,44 +319,44 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_exp_f16_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -367,6 +366,7 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call half @llvm.pow.f16(half %x, half %y)
   ret half %pow
@@ -375,7 +375,6 @@ define half @v_pow_f16(half %x, half %y) {
 define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -401,11 +400,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f16_e32 v2, v0
 ; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -419,11 +418,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f16_e32 v2, v0
 ; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -437,11 +436,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f16_e32 v2, v0
 ; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -456,11 +455,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f16_e32 v2, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -482,6 +481,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
   ret <2 x half> %pow
@@ -490,7 +490,6 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -521,11 +520,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_log_f16_e32 v2, v0
 ; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -540,11 +539,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_log_f16_e32 v2, v0
 ; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -559,11 +558,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -579,11 +578,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -607,6 +606,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -616,7 +616,6 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -647,11 +646,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f16_e32 v2, v0
 ; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
@@ -666,11 +665,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f16_e32 v2, v0
 ; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
@@ -685,11 +684,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f16_e32 v2, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX10-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -705,11 +704,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f16_e32 v2, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
@@ -732,6 +731,7 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -741,7 +741,6 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -777,11 +776,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_log_f16_e32 v2, v0
 ; GFX8-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -797,11 +796,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX8-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_log_f16_e32 v2, v0
 ; GFX9-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -817,11 +816,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_exp_f16_e32 v1, v2
 ; GFX9-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX10-NEXT:    v_log_f16_e32 v2, v0
@@ -838,11 +837,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -867,6 +866,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y
@@ -883,7 +883,6 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -903,11 +902,11 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -927,11 +926,11 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -951,11 +950,11 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
@@ -969,11 +968,11 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -993,6 +992,7 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %pow = call float @llvm.pow.f32(float %fabs.x, float %y)
@@ -1002,7 +1002,6 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1022,11 +1021,11 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1046,11 +1045,11 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1070,11 +1069,11 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
@@ -1088,11 +1087,11 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1111,6 +1110,7 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %pow = call float @llvm.pow.f32(float %x, float %fabs.y)
@@ -1120,7 +1120,6 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -1140,11 +1139,11 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -1164,11 +1163,11 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -1188,11 +1187,11 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, |v0|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e64 v0, |v0|, v2
@@ -1206,11 +1205,11 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -1230,6 +1229,7 @@ define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
@@ -1578,7 +1578,6 @@ define amdgpu_ps float @v_pow_f32_sgpr_sgpr(float inreg %x, float inreg %y) {
 define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
@@ -1598,11 +1597,11 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
@@ -1622,11 +1621,11 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v2
@@ -1646,11 +1645,11 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, 0x800000, -v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e64 v0, -v0, v2
@@ -1664,11 +1663,11 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -1688,6 +1687,7 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %pow = call float @llvm.pow.f32(float %neg.x, float %y)
@@ -1697,7 +1697,6 @@ define float @v_pow_f32_fneg_lhs(float %x, float %y) {
 define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1717,11 +1716,11 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1741,11 +1740,11 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -1765,11 +1764,11 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
@@ -1783,11 +1782,11 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1806,6 +1805,7 @@ define float @v_pow_f32_fneg_rhs(float %x, float %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.y = fneg float %y
   %pow = call float @llvm.pow.f32(float %x, float %neg.y)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index f9b98059be0b3a4..0f943f36dce49dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -176,7 +176,6 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-LABEL: v_fshl_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -201,11 +200,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -231,11 +230,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -261,11 +260,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x7f, v1
@@ -291,11 +290,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7f, v1
@@ -331,6 +330,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
   ret i7 %result
@@ -402,7 +402,6 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-LABEL: v_fshl_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -410,11 +409,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX8-NEXT:    v_not_b32_e32 v2, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
@@ -423,11 +422,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
@@ -436,11 +435,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v3, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -449,11 +448,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v3, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -465,6 +464,7 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX11-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt)
   ret i8 %result
@@ -521,47 +521,47 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
 define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshl_i8_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i8_4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i8_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i8_4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i8_4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
   ret i8 %result
@@ -618,47 +618,47 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
 define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshl_i8_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 3, 5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i8_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i8_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i8_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 5, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i8_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 3, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
   ret i8 %result
@@ -807,7 +807,6 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v2
@@ -829,11 +828,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
@@ -855,11 +854,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
@@ -881,11 +880,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
@@ -908,11 +907,11 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
@@ -940,6 +939,7 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -1219,7 +1219,6 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-LABEL: v_fshl_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
@@ -1266,11 +1265,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
@@ -1316,11 +1315,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
@@ -1365,11 +1364,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
 ; GFX10-NEXT:    v_and_b32_e32 v10, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -1418,11 +1417,11 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -1475,6 +1474,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1646,7 +1646,6 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-LABEL: v_fshl_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1672,11 +1671,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1702,11 +1701,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1731,11 +1730,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_bfe_u32 v1, v1, 1, 23
@@ -1759,11 +1758,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX11-NEXT:    v_bfe_u32 v1, v1, 1, 23
@@ -1796,6 +1795,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v2, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt)
   ret i24 %result
@@ -2417,7 +2417,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-LABEL: v_fshl_v2i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2460,11 +2459,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2507,11 +2506,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2552,11 +2551,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
@@ -2596,11 +2595,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v2, 1, 23
@@ -2654,6 +2653,7 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -2795,21 +2795,21 @@ define amdgpu_ps i32 @s_fshl_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
 define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) {
 ; GCN-LABEL: v_fshl_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v1, v0, v1, 1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v0, v1, 1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_not_b32_e32 v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt)
   ret i32 %result
@@ -2818,14 +2818,14 @@ define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) {
 define i32 @v_fshl_i32_5(i32 %lhs, i32 %rhs) {
 ; GCN-LABEL: v_fshl_i32_5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 27
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i32_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, 27
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5)
   ret i32 %result
@@ -2834,14 +2834,14 @@ define i32 @v_fshl_i32_5(i32 %lhs, i32 %rhs) {
 define i32 @v_fshl_i32_8(i32 %lhs, i32 %rhs) {
 ; GCN-LABEL: v_fshl_i32_8:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 24
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i32_8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, 24
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8)
   ret i32 %result
@@ -3000,7 +3000,6 @@ define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg
 define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX6-LABEL: v_fshl_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_alignbit_b32 v2, v0, v2, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_not_b32_e32 v4, v4
@@ -3009,11 +3008,11 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_not_b32_e32 v3, v5
 ; GFX6-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_alignbit_b32 v2, v0, v2, 1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
@@ -3022,11 +3021,11 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_not_b32_e32 v3, v5
 ; GFX8-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v2, v0, v2, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_not_b32_e32 v4, v4
@@ -3035,11 +3034,11 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, v5
 ; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v2, v0, v2, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_not_b32_e32 v4, v4
@@ -3048,11 +3047,11 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX10-NEXT:    v_not_b32_e32 v5, v5
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v2, v0, v2, 1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_not_b32_e32 v4, v4
@@ -3062,6 +3061,7 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
   ret <2 x i32> %result
@@ -3070,7 +3070,6 @@ define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX6-LABEL: v_fshl_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_alignbit_b32 v3, v0, v3, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_not_b32_e32 v6, v6
@@ -3083,11 +3082,11 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_not_b32_e32 v4, v8
 ; GFX6-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_alignbit_b32 v3, v0, v3, 1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_not_b32_e32 v6, v6
@@ -3100,11 +3099,11 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    v_not_b32_e32 v4, v8
 ; GFX8-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v3, v0, v3, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_not_b32_e32 v6, v6
@@ -3117,11 +3116,11 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_not_b32_e32 v4, v8
 ; GFX9-NEXT:    v_alignbit_b32 v2, v2, v3, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v3i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v3, v0, v3, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_not_b32_e32 v6, v6
@@ -3134,11 +3133,11 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v3, v0, v3, 1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_not_b32_e32 v6, v6
@@ -3152,6 +3151,7 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
   ret <3 x i32> %result
@@ -3160,7 +3160,6 @@ define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX6-LABEL: v_fshl_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_alignbit_b32 v4, v0, v4, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
@@ -3177,11 +3176,11 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_not_b32_e32 v5, v11
 ; GFX6-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_alignbit_b32 v4, v0, v4, 1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
@@ -3198,11 +3197,11 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_not_b32_e32 v5, v11
 ; GFX8-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v4, v0, v4, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
@@ -3219,11 +3218,11 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_not_b32_e32 v5, v11
 ; GFX9-NEXT:    v_alignbit_b32 v3, v3, v4, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v4, v0, v4, 1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_not_b32_e32 v8, v8
@@ -3240,11 +3239,11 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v4, v0, v4, 1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_not_b32_e32 v8, v8
@@ -3262,6 +3261,7 @@ define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
   ret <4 x i32> %result
@@ -3427,7 +3427,6 @@ define amdgpu_ps i16 @s_fshl_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
 define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-LABEL: v_fshl_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3437,11 +3436,11 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3449,11 +3448,11 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3461,11 +3460,11 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
@@ -3473,11 +3472,11 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 1, v1
@@ -3487,6 +3486,7 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
   ret i16 %result
@@ -3495,43 +3495,43 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshl_i16_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 12, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i16_4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 12, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i16_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 12, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i16_4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 12, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i16_4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 12, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4)
   ret i16 %result
@@ -3540,43 +3540,43 @@ define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) {
 define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshl_i16_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 11, 5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i16_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 11, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i16_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 5, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 11, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i16_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 5, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 11, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i16_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 5, v0
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 11, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5)
   ret i16 %result
@@ -3911,7 +3911,6 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshl_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
@@ -3930,11 +3929,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
@@ -3954,11 +3953,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -3966,11 +3965,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
@@ -3978,11 +3977,11 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1]
@@ -3992,6 +3991,7 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
   ret <2 x i16> %result
@@ -4000,7 +4000,6 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_fshl_v2i16_4_8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 11, v2
@@ -4009,11 +4008,11 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i16_4_8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 12, v1
@@ -4025,33 +4024,34 @@ define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i16_4_8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x80004
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x8000c
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i16_4_8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 0x80004, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 0x8000c, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i16_4_8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 0x80004, v0
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 0x8000c, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>)
   ret <2 x i16> %result
@@ -4590,7 +4590,6 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
 ; GFX6-LABEL: v_fshl_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v9, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
@@ -4618,11 +4617,11 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX8-NEXT:    v_and_b32_e32 v7, 15, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
@@ -4650,11 +4649,11 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4669,11 +4668,11 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4688,11 +4687,11 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4709,6 +4708,7 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
   %cast.result = bitcast <3 x i16> %result to <3 x half>
@@ -4945,7 +4945,6 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) {
 ; GFX6-LABEL: v_fshl_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v12, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
@@ -4982,11 +4981,11 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX8-NEXT:    v_and_b32_e32 v8, 15, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
@@ -5024,11 +5023,11 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5043,11 +5042,11 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v4, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5062,11 +5061,11 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5083,6 +5082,7 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
   %cast.result = bitcast <4 x i16> %result to <4 x half>
@@ -5182,7 +5182,6 @@ define amdgpu_ps i64 @s_fshl_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
 define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX6-LABEL: v_fshl_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX6-NEXT:    v_not_b32_e32 v4, v4
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
@@ -5191,11 +5190,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
@@ -5204,11 +5203,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX9-NEXT:    v_not_b32_e32 v4, v4
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
@@ -5217,11 +5216,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v5, v4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
@@ -5230,11 +5229,11 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v4
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_and_b32_e32 v4, 63, v4
@@ -5246,6 +5245,7 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt)
   ret i64 %result
@@ -5254,43 +5254,43 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 define i64 @v_fshl_i64_5(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_fshl_i64_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i64_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i64_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i64_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i64_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 5, v[0:1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 27, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5)
   ret i64 %result
@@ -5299,15 +5299,15 @@ define i64 @v_fshl_i64_5(i64 %lhs, i64 %rhs) {
 define i64 @v_fshl_i64_32(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: v_fshl_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i64_32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32)
   ret i64 %result
@@ -5316,45 +5316,45 @@ define i64 @v_fshl_i64_32(i64 %lhs, i64 %rhs) {
 define i64 @v_fshl_i64_48(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_fshl_i64_48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[2:3], 16
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i64_48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i64_48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i64_48:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i64_48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 16, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48)
   ret i64 %result
@@ -5640,7 +5640,6 @@ define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
 define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-LABEL: v_fshl_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], 1
@@ -5657,11 +5656,11 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
@@ -5678,11 +5677,11 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
@@ -5699,11 +5698,11 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
 ; GFX10-NEXT:    v_not_b32_e32 v11, v10
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
@@ -5720,11 +5719,11 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
 ; GFX11-NEXT:    v_not_b32_e32 v11, v10
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
@@ -5745,6 +5744,7 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
   ret <2 x i64> %result
@@ -5987,7 +5987,6 @@ define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshl_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_and_b32_e32 v15, 0x7f, v8
@@ -6031,11 +6030,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v11, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v12, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_and_b32_e32 v15, 0x7f, v8
@@ -6079,11 +6078,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v11, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v12, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_and_b32_e32 v15, 0x7f, v8
@@ -6126,11 +6125,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v1, v11, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v12, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v18, 0x7f, v8
 ; GFX10-NEXT:    v_not_b32_e32 v8, v8
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
@@ -6173,11 +6172,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[4:5]
 ; GFX11-NEXT:    v_and_b32_e32 v18, 0x7f, v8
 ; GFX11-NEXT:    v_not_b32_e32 v8, v8
@@ -6224,6 +6223,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -7095,51 +7095,50 @@ define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
 define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX6-LABEL: v_fshl_i128_65:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[0:1], 1
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[6:7], 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_i128_65:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[6:7]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_i128_65:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[6:7]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX9-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 31, v7
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_i128_65:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 31, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_i128_65:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[6:7]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
@@ -7147,6 +7146,7 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65)
   ret i128 %result
@@ -7594,7 +7594,6 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) {
 ; GFX6-LABEL: v_fshl_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX6-NEXT:    v_not_b32_e32 v16, v16
 ; GFX6-NEXT:    v_and_b32_e32 v24, 0x7f, v16
@@ -7681,11 +7680,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX6-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshl_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX8-NEXT:    v_not_b32_e32 v16, v16
 ; GFX8-NEXT:    v_and_b32_e32 v24, 0x7f, v16
@@ -7772,11 +7771,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX8-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshl_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX9-NEXT:    v_not_b32_e32 v16, v16
 ; GFX9-NEXT:    v_and_b32_e32 v24, 0x7f, v16
@@ -7861,11 +7860,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v18, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v20, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v27, 0x7f, v16
 ; GFX10-NEXT:    v_not_b32_e32 v16, v16
 ; GFX10-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
@@ -7950,11 +7949,11 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v27, 0x7f, v16
 ; GFX11-NEXT:    v_not_b32_e32 v16, v16
 ; GFX11-NEXT:    v_lshrrev_b64 v[8:9], 1, v[8:9]
@@ -8046,6 +8045,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
 ; GFX11-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index c8455665e7b40f1..650584b19fb8904 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -172,7 +172,6 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-LABEL: v_fshr_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -198,11 +197,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -228,11 +227,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7f, v2
@@ -258,11 +257,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
@@ -288,11 +287,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, 7
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7f, v2
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
@@ -328,6 +327,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
   ret i7 %result
@@ -400,7 +400,6 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
 define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-LABEL: v_fshr_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -409,11 +408,11 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX8-NEXT:    v_not_b32_e32 v2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -421,11 +420,11 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 7, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
@@ -433,11 +432,11 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v3, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
@@ -446,11 +445,11 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v3, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
@@ -461,6 +460,7 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt)
   ret i8 %result
@@ -517,47 +517,47 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
 define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshr_i8_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i8_4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i8_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i8_4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i8_4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4)
   ret i8 %result
@@ -614,47 +614,47 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) {
 define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_fshr_i8_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 5, 3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i8_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 3, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i8_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 5
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i8_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 3, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 5, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i8_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 5, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5)
   ret i8 %result
@@ -804,7 +804,6 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-LABEL: v_fshr_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
 ; GFX6-NEXT:    v_and_b32_e32 v5, 7, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v2
@@ -827,11 +826,11 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX8-NEXT:    v_and_b32_e32 v6, 7, v2
 ; GFX8-NEXT:    v_not_b32_e32 v2, v2
@@ -852,11 +851,11 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
@@ -877,11 +876,11 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
@@ -904,11 +903,11 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
@@ -937,6 +936,7 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -1218,7 +1218,6 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-LABEL: v_fshr_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
@@ -1267,11 +1266,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
@@ -1317,11 +1316,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
@@ -1366,11 +1365,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_not_b32_e32 v8, v2
@@ -1419,11 +1418,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
@@ -1476,6 +1475,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1652,7 +1652,6 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-LABEL: v_fshr_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1679,11 +1678,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1710,11 +1709,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffffe8
@@ -1740,11 +1739,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
@@ -1769,11 +1768,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, 24
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
@@ -1807,6 +1806,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v3, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
   ret i24 %result
@@ -2427,7 +2427,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-LABEL: v_fshr_v2i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2472,11 +2471,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2521,11 +2520,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
@@ -2568,11 +2567,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v1, v4, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
@@ -2614,11 +2613,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v5, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -2673,6 +2672,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v4, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -2800,14 +2800,14 @@ define amdgpu_ps i32 @s_fshr_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
 define i32 @v_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt) {
 ; GCN-LABEL: v_fshr_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
   ret i32 %result
@@ -2816,14 +2816,14 @@ define i32 @v_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt) {
 define i32 @v_fshr_i32_5(i32 %lhs, i32 %rhs) {
 ; GCN-LABEL: v_fshr_i32_5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i32_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, 5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
   ret i32 %result
@@ -2832,14 +2832,14 @@ define i32 @v_fshr_i32_5(i32 %lhs, i32 %rhs) {
 define i32 @v_fshr_i32_8(i32 %lhs, i32 %rhs) {
 ; GCN-LABEL: v_fshr_i32_8:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i32_8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, 8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
   ret i32 %result
@@ -2953,16 +2953,16 @@ define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg
 define <2 x i32> @v_fshr_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 ; GCN-LABEL: v_fshr_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
   ret <2 x i32> %result
@@ -2971,18 +2971,18 @@ define <2 x i32> @v_fshr_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
 define <3 x i32> @v_fshr_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 ; GCN-LABEL: v_fshr_v3i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GCN-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
   ret <3 x i32> %result
@@ -2991,20 +2991,20 @@ define <3 x i32> @v_fshr_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
 define <4 x i32> @v_fshr_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
 ; GCN-LABEL: v_fshr_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v0, v4, v8
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GCN-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GCN-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v4, v8
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX11-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
   ret <4 x i32> %result
@@ -3171,7 +3171,6 @@ define amdgpu_ps i16 @s_fshr_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
 define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-LABEL: v_fshr_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3182,11 +3181,11 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3194,11 +3193,11 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3206,11 +3205,11 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v2, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v3, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3218,11 +3217,11 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
@@ -3232,6 +3231,7 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
   ret i16 %result
@@ -3240,43 +3240,43 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshr_i16_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 4, 12
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i16_4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i16_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 4, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i16_4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i16_4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 12, v0
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 4, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
   ret i16 %result
@@ -3285,43 +3285,43 @@ define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
 define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_fshr_i16_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 11, v0
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 5, 11
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i16_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 11, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 5, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i16_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 11, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 5, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i16_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 11, v0
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 5, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i16_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 11, v0
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 5, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
   ret i16 %result
@@ -3676,7 +3676,6 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_or_b32_e32 v4, v5, v4
@@ -3710,11 +3709,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v1
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -3744,11 +3743,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -3756,11 +3755,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -3768,11 +3767,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -3782,6 +3781,7 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
   ret <2 x i16> %result
@@ -3790,7 +3790,6 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_fshr_v2i16_4_8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 12, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 3, v2
@@ -3799,11 +3798,11 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 7, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i16_4_8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 4, v1
@@ -3815,33 +3814,34 @@ define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i16_4_8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x8000c
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x80004
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i16_4_8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 0x8000c, v0
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 0x80004, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i16_4_8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 0x8000c, v0
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 0x80004, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>)
   ret <2 x i16> %result
@@ -4483,7 +4483,6 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
@@ -4534,11 +4533,11 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 15, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -4581,11 +4580,11 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4600,11 +4599,11 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v4, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v2, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4619,11 +4618,11 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -4640,6 +4639,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
   %cast.result = bitcast <3 x i16> %result to <3 x half>
@@ -4922,7 +4922,6 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) {
 ; GFX6-LABEL: v_fshr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_or_b32_e32 v8, v9, v8
@@ -4989,11 +4988,11 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 15, v2
 ; GFX8-NEXT:    v_or_b32_e32 v6, v6, v7
@@ -5050,11 +5049,11 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xf000f, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5069,11 +5068,11 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v4, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v2, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5088,11 +5087,11 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v6, -1, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v5
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xf000f, v4
@@ -5109,6 +5108,7 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
   %cast.result = bitcast <4 x i16> %result to <4 x half>
@@ -5238,7 +5238,6 @@ define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) {
 define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX6-LABEL: v_fshr_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX6-NEXT:    v_not_b32_e32 v4, v4
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
@@ -5247,11 +5246,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX8-NEXT:    v_not_b32_e32 v4, v4
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5260,11 +5259,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
 ; GFX9-NEXT:    v_not_b32_e32 v4, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5273,11 +5272,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v5, v4
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
@@ -5286,11 +5285,11 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v5, v4
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX11-NEXT:    v_and_b32_e32 v4, 63, v4
@@ -5302,6 +5301,7 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
   ret i64 %result
@@ -5310,45 +5310,45 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) {
 define i64 @v_fshr_i64_5(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_fshr_i64_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[2:3], 5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i64_5:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 27, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i64_5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 27, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i64_5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 27, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i64_5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX11-NEXT:    v_lshrrev_b64 v[0:1], 5, v[2:3]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 27, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5)
   ret i64 %result
@@ -5357,15 +5357,15 @@ define i64 @v_fshr_i64_5(i64 %lhs, i64 %rhs) {
 define i64 @v_fshr_i64_32(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: v_fshr_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i64_32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
   ret i64 %result
@@ -5374,40 +5374,40 @@ define i64 @v_fshr_i64_32(i64 %lhs, i64 %rhs) {
 define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_fshr_i64_48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 16
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i64_48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i64_48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i64_48:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i64_48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
   ret i64 %result
@@ -5693,7 +5693,6 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %
 define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-LABEL: v_fshr_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
@@ -5710,11 +5709,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5731,11 +5730,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5752,11 +5751,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
 ; GFX10-NEXT:    v_not_b32_e32 v11, v10
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5773,11 +5772,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
 ; GFX11-NEXT:    v_not_b32_e32 v11, v10
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -5798,6 +5797,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
   ret <2 x i64> %result
@@ -6040,7 +6040,6 @@ define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg
 define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-LABEL: v_fshr_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX6-NEXT:    v_not_b32_e32 v8, v8
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
@@ -6084,11 +6083,11 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v13, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX8-NEXT:    v_not_b32_e32 v8, v8
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
@@ -6132,11 +6131,11 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v13, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v14, 0x7f, v8
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
@@ -6180,11 +6179,11 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX9-NEXT:    v_or_b32_e32 v1, v13, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
@@ -6228,11 +6227,11 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX10-NEXT:    v_or_b32_e32 v1, v7, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 31, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
@@ -6280,6 +6279,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
   ret i128 %result
@@ -7154,51 +7154,50 @@ define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) {
 define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX6-LABEL: v_fshr_i128_65:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 31, v2
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[0:1], 1
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[6:7], 1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_i128_65:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 31, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v5, 31, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 1, v[6:7]
 ; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i128_65:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 31, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 31, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_i128_65:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX10-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v0, 31, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 31, v3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i128_65:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX11-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 1, v[0:1]
@@ -7207,6 +7206,7 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 31, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65)
   ret i128 %result
@@ -7654,7 +7654,6 @@ define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr
 define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) {
 ; GFX6-LABEL: v_fshr_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX6-NEXT:    v_not_b32_e32 v16, v16
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
@@ -7741,11 +7740,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX6-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX6-NEXT:    v_or_b32_e32 v6, v10, v6
 ; GFX6-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fshr_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX8-NEXT:    v_not_b32_e32 v16, v16
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
@@ -7832,11 +7831,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX8-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX8-NEXT:    v_or_b32_e32 v6, v10, v6
 ; GFX8-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v23, 0x7f, v16
 ; GFX9-NEXT:    v_not_b32_e32 v16, v16
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
@@ -7923,11 +7922,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX9-NEXT:    v_or_b32_e32 v5, v17, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v10, v6
 ; GFX9-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_not_b32_e32 v17, v16
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_and_b32_e32 v26, 0x7f, v16
@@ -8014,11 +8013,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX10-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX10-NEXT:    v_or_b32_e32 v6, v6, v9
 ; GFX10-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_not_b32_e32 v17, v16
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
@@ -8115,6 +8114,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v5, v14, v8
 ; GFX11-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index 10cbc56cc5fbea1..20af7ca68a88371 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -10,7 +10,7 @@
 define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-LABEL: waterfall_loop:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b32 s4, -1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
@@ -191,7 +191,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b32 exec_lo, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ret = tail call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, <8 x i32> %vgpr_srd, <4 x i32> zeroinitializer, i1 false, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
index 6515d25f7415711..df4a6fee7196407 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inline-asm.ll
@@ -4,7 +4,6 @@
 define i32 @test_sgpr_reg_class_constraint() nounwind {
 ; CHECK-LABEL: test_sgpr_reg_class_constraint:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s4, 7
 ; CHECK-NEXT:    ;;#ASMEND
@@ -15,6 +14,7 @@ define i32 @test_sgpr_reg_class_constraint() nounwind {
 ; CHECK-NEXT:    s_add_u32 s4, s4, s5
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
@@ -26,7 +26,6 @@ entry:
 define i32 @test_sgpr_matching_constraint() nounwind {
 ; CHECK-LABEL: test_sgpr_matching_constraint:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s5, 7
 ; CHECK-NEXT:    ;;#ASMEND
@@ -37,6 +36,7 @@ define i32 @test_sgpr_matching_constraint() nounwind {
 ; CHECK-NEXT:    s_add_u32 s4, s5, s4
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
@@ -48,13 +48,13 @@ entry:
 define i32 @test_sgpr_to_vgpr_move_reg_class_constraint() nounwind {
 ; CHECK-LABEL: test_sgpr_to_vgpr_move_reg_class_constraint:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s4, 7
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    v_mov_b32 v0, s4
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
@@ -65,7 +65,6 @@ entry:
 define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
 ; CHECK-LABEL: test_sgpr_to_vgpr_move_matching_constraint:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    s_mov_b32 s4, 7
 ; CHECK-NEXT:    ;;#ASMEND
@@ -73,6 +72,7 @@ define i32 @test_sgpr_to_vgpr_move_matching_constraint() nounwind {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    v_mov_b32 v0, v0
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %asm0 = tail call i32 asm "s_mov_b32 $0, 7", "=s"() nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index dc9cbb498dab4d8..34fc573c027ecb7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -88,7 +88,6 @@ entry:
 define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_const_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v8, 1.0, v0, vcc
 ; GPRIDX-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
@@ -112,11 +111,11 @@ define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
 ; GPRIDX-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GPRIDX-NEXT:    v_mov_b32_e32 v0, v8
 ; GPRIDX-NEXT:    v_mov_b32_e32 v1, v9
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_insertelement_v8f32_const_s_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, 1.0, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
@@ -135,11 +134,11 @@ define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, 0x41000000, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_insertelement_v8f32_const_s_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, 1.0, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
@@ -157,6 +156,7 @@ define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) {
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 7, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX11-NEXT:    v_dual_cndmask_b32 v7, 0x41000000, v0 :: v_dual_mov_b32 v0, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %insert = insertelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, float %val, i32 %idx
@@ -685,7 +685,6 @@ entry:
 define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GPRIDX:       ; %bb.0: ; %entry
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_mov_b32 s18, 0
 ; GPRIDX-NEXT:    s_mov_b32 s16, 0
 ; GPRIDX-NEXT:    s_mov_b32 s14, 0
@@ -746,12 +745,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[11:14], off
 ; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
 ; GPRIDX-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
-; GPRIDX-NEXT:    s_waitcnt vmcnt(0)
+; GPRIDX-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s18, 0
 ; GFX10-NEXT:    s_mov_b32 s16, 0
 ; GFX10-NEXT:    s_mov_b32 s14, 0
@@ -813,11 +811,11 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[15:18], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: dyn_insertelement_v8f64_const_s_v_v:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s14, 0
 ; GFX11-NEXT:    s_mov_b32 s15, 0x40200000
 ; GFX11-NEXT:    s_mov_b32 s12, 0
@@ -867,6 +865,7 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[15:18], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %insert = insertelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, double %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a96f..e6fa482ad991436 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -9,52 +9,52 @@
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_div_fmas_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W32-LABEL: v_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W32-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
+; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
   ret float %result
@@ -63,52 +63,52 @@ define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX7-NEXT:    s_nop 3
 ; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_div_fmas_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX8-NEXT:    s_nop 3
 ; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W32-LABEL: v_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_W64-LABEL: v_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W32-LABEL: v_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W32-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX11_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11_W64-LABEL: v_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX11_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
   ret double %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
index 03ed6837291c889..c76305fafa5dbcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
@@ -145,24 +145,21 @@ define amdgpu_ps void @ds_fadd_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr,
 define float @ds_fadd_f32_vv(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fadd_f32_vv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fadd_f32_vv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fadd_f32_vv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
   ret float %ret
@@ -171,24 +168,21 @@ define float @ds_fadd_f32_vv(ptr addrspace(3) %ptr, float %val) {
 define float @ds_fadd_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fadd_f32_vv_offset:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fadd_f32_vv_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_add_rtn_f32 v0, v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
@@ -198,24 +192,21 @@ define float @ds_fadd_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
 define void @ds_fadd_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fadd_f32_vv_nortn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_add_f32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fadd_f32_vv_nortn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_f32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fadd_f32_vv_nortn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_add_f32 v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
   ret void
@@ -224,24 +215,21 @@ define void @ds_fadd_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
 define void @ds_fadd_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fadd_f32_vv_offset_nortn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_add_f32 v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fadd_f32_vv_offset_nortn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_f32 v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset_nortn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_add_f32 v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
@@ -251,24 +239,21 @@ define void @ds_fadd_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
 define float @ds_fadd_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fadd_f32_vv_volatile:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fadd_f32_vv_volatile:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fadd_f32_vv_volatile:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
   ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
index 0f6fb5acd56adac..2dd1396ad7ba2d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
@@ -145,24 +145,21 @@ define amdgpu_ps void @ds_fmin_f32_ss_offset_nortn(ptr addrspace(3) inreg %ptr,
 define float @ds_fmin_f32_vv(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fmin_f32_vv:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fmin_f32_vv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fmin_f32_vv:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
   ret float %ret
@@ -171,24 +168,21 @@ define float @ds_fmin_f32_vv(ptr addrspace(3) %ptr, float %val) {
 define float @ds_fmin_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fmin_f32_vv_offset:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fmin_f32_vv_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
@@ -198,24 +192,21 @@ define float @ds_fmin_f32_vv_offset(ptr addrspace(3) %ptr, float %val) {
 define void @ds_fmin_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fmin_f32_vv_nortn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_min_f32 v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fmin_f32_vv_nortn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_f32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fmin_f32_vv_nortn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_min_f32 v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
   ret void
@@ -224,24 +215,21 @@ define void @ds_fmin_f32_vv_nortn(ptr addrspace(3) %ptr, float %val) {
 define void @ds_fmin_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fmin_f32_vv_offset_nortn:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fmin_f32_vv_offset_nortn:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset_nortn:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_min_f32 v0, v1 offset:512
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(3) %ptr, i32 128
   %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %gep, float %val, i32 0, i32 0, i1 false)
@@ -251,24 +239,21 @@ define void @ds_fmin_f32_vv_offset_nortn(ptr addrspace(3) %ptr, float %val) {
 define float @ds_fmin_f32_vv_volatile(ptr addrspace(3) %ptr, float %val) {
 ; GFX8-LABEL: ds_fmin_f32_vv_volatile:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ds_fmin_f32_vv_volatile:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: ds_fmin_f32_vv_volatile:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    ds_min_rtn_f32 v0, v0, v1
-; GFX10PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)
   ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 9cdb9151e4dfbf1..60ac2d3d5e40c71 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -7,8 +7,8 @@
 define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   ret float %r
@@ -17,14 +17,14 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_fdot2_clamp:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
   ret float %r
@@ -33,8 +33,8 @@ define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_neg_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
@@ -44,8 +44,8 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_neg_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
@@ -55,8 +55,8 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_neg_a_neg_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %b
   %neg.b = fneg <2 x half> %b
@@ -67,9 +67,9 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_neg_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
@@ -79,8 +79,8 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
 define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   ret float %ret
@@ -89,8 +89,8 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
 define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   ret float %ret
@@ -99,8 +99,8 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
 define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
 ; GFX906-LABEL: v_fdot2_inline_literal_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_f32_f16 v0, v0, v1, 1.0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
   ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
index 7a89270f3546e63..23794314ebdaa70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll
@@ -9,38 +9,38 @@
 define float @v_mul_legacy_f32(float %a, float %b) {
 ; GFX6-LABEL: v_mul_legacy_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   ret float %result
@@ -49,38 +49,38 @@ define float @v_mul_legacy_f32(float %a, float %b) {
 define float @v_mul_legacy_undef0_f32(float %a) {
 ; GFX6-LABEL: v_mul_legacy_undef0_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_undef0_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_undef0_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_undef0_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_undef0_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_undef0_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
   ret float %result
@@ -89,38 +89,38 @@ define float @v_mul_legacy_undef0_f32(float %a) {
 define float @v_mul_legacy_undef1_f32(float %a) {
 ; GFX6-LABEL: v_mul_legacy_undef1_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_undef1_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_undef1_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_undef1_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_undef1_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, s4, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_undef1_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, s0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
   ret float %result
@@ -129,38 +129,38 @@ define float @v_mul_legacy_undef1_f32(float %a) {
 define float @v_mul_legacy_undef_f32() {
 ; GFX6-LABEL: v_mul_legacy_undef_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e64 v0, s4, s4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_undef_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e64 v0, s4, s4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_undef_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e64 v0, s4, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_undef_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e64 v0, s4, s4
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_undef_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e64 v0, s4, s4
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_undef_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, s0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float undef, float undef)
   ret float %result
@@ -169,38 +169,38 @@ define float @v_mul_legacy_undef_f32() {
 define float @v_mul_legacy_fabs_f32(float %a, float %b) {
 ; GFX6-LABEL: v_mul_legacy_fabs_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e64 v0, |v0|, |v1|
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_fabs_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e64 v0, |v0|, |v1|
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_fabs_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e64 v0, |v0|, |v1|
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_fabs_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e64 v0, |v0|, |v1|
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_fabs_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e64 v0, |v0|, |v1|
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_fabs_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, |v0|, |v1|
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
@@ -211,38 +211,38 @@ define float @v_mul_legacy_fabs_f32(float %a, float %b) {
 define float @v_mul_legacy_fneg_f32(float %a, float %b) {
 ; GFX6-LABEL: v_mul_legacy_fneg_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_fneg_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_fneg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_fneg_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_fneg_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_fneg_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, -v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
@@ -254,45 +254,45 @@ define float @v_mul_legacy_fneg_f32(float %a, float %b) {
 define float @v_add_mul_legacy_f32(float %a, float %b, float %c) {
 ; GFX6-LABEL: v_add_mul_legacy_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_mul_legacy_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_mul_legacy_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_add_mul_legacy_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX101-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_add_mul_legacy_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX103-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_add_mul_legacy_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %add = fadd float %mul, %c
@@ -302,41 +302,41 @@ define float @v_add_mul_legacy_f32(float %a, float %b, float %c) {
 define float @v_mad_legacy_f32(float %a, float %b, float %c) #2 {
 ; GFX6-LABEL: v_mad_legacy_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mad_legacy_f32 v0, v0, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mad_legacy_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_legacy_f32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_legacy_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_f32 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mad_legacy_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mad_legacy_f32 v0, v0, v1, v2
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mad_legacy_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX103-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mad_legacy_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %add = fadd float %mul, %c
@@ -346,41 +346,41 @@ define float @v_mad_legacy_f32(float %a, float %b, float %c) #2 {
 define float @v_mad_legacy_fneg_f32(float %a, float %b, float %c) #2 {
 ; GFX6-LABEL: v_mad_legacy_fneg_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mad_legacy_f32 v0, -v0, -v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mad_legacy_fneg_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_legacy_f32 v0, -v0, -v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_legacy_fneg_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_f32 v0, -v0, -v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mad_legacy_fneg_f32:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mad_legacy_f32 v0, -v0, -v1, v2
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mad_legacy_fneg_f32:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e64 v0, -v0, -v1
 ; GFX103-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mad_legacy_fneg_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, -v0, -v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
@@ -429,38 +429,38 @@ define amdgpu_ps float @s_mul_legacy_f32(float inreg %a, float inreg %b) {
 define float @v_mul_legacy_f32_1.0(float %a) {
 ; GFX6-LABEL: v_mul_legacy_f32_1.0:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_f32_1.0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_f32_1.0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_f32_1.0:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_f32_1.0:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_f32_1.0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float 1.0)
   ret float %result
@@ -469,38 +469,38 @@ define float @v_mul_legacy_f32_1.0(float %a) {
 define float @v_mul_legacy_f32_1.0_swap(float %b) {
 ; GFX6-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, 1.0, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_f32_1.0_swap:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float 1.0, float %b)
   ret float %result
@@ -509,38 +509,38 @@ define float @v_mul_legacy_f32_1.0_swap(float %b) {
 define float @v_mul_legacy_f32_2.0(float %a) {
 ; GFX6-LABEL: v_mul_legacy_f32_2.0:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_f32_2.0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_f32_2.0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_f32_2.0:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_f32_2.0:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_f32_2.0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 2.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float 2.0)
   ret float %result
@@ -549,38 +549,38 @@ define float @v_mul_legacy_f32_2.0(float %a) {
 define float @v_mul_legacy_f32_2.0_swap(float %b) {
 ; GFX6-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX101-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX101:       ; %bb.0:
-; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX101-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX101-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX103-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX103:       ; %bb.0:
-; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    v_mul_legacy_f32_e32 v0, 2.0, v0
+; GFX103-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX103-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_legacy_f32_2.0_swap:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, 2.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.amdgcn.fmul.legacy(float 2.0, float %b)
   ret float %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 4ab963ed85ccada..4ee9c7173d2467a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -6,16 +6,14 @@
 define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_atomic_csub:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret i32 %ret
@@ -24,20 +22,18 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
 define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_atomic_csub_offset:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
@@ -47,16 +43,14 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
 define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub_nortn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_atomic_csub_nortn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
   ret void
@@ -65,20 +59,18 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
 define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
 ; GFX10-LABEL: global_atomic_csub_offset_nortn:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_atomic_csub_offset_nortn:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
   %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
index 752ddbb896c6b1b..ec5d12d11062cde 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -5,16 +5,14 @@
 define void @global_atomic_fadd_f32(ptr addrspace(1) %ptr, float %data) {
 ; GFX908-LABEL: global_atomic_fadd_f32:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_atomic_fadd_f32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
   ret void
@@ -23,16 +21,14 @@ define void @global_atomic_fadd_f32(ptr addrspace(1) %ptr, float %data) {
 define void @global_atomic_fadd_f32_off_2048(ptr addrspace(1) %ptr, float %data) {
 ; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_atomic_fadd_f32_off_2048:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:2048
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
@@ -42,16 +38,14 @@ define void @global_atomic_fadd_f32_off_2048(ptr addrspace(1) %ptr, float %data)
 define void @global_atomic_fadd_f32_off_neg2047(ptr addrspace(1) %ptr, float %data) {
 ; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:-2044
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v[0:1], v2, off offset:-2044
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -511
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
@@ -86,16 +80,14 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr,
 define void @global_atomic_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
 ; GFX908-LABEL: global_atomic_fadd_v2f16:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_atomic_fadd_v2f16:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
   ret void
@@ -104,16 +96,14 @@ define void @global_atomic_fadd_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
 define void @global_atomic_fadd_v2f16_off_neg2047(ptr addrspace(1) %ptr, <2 x half> %data) {
 ; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -511
   %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %gep, <2 x half> %data)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
index bd570df3d83b3f6..41839479c985a27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -5,16 +5,16 @@
 define float @v_rsq_clamp_f32(float %src) #0 {
 ; SI-LABEL: v_rsq_clamp_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f32_e32 v0, v0
 ; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
 ; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
   ret float %rsq_clamp
@@ -23,16 +23,16 @@ define float @v_rsq_clamp_f32(float %src) #0 {
 define float @v_rsq_clamp_fabs_f32(float %src) #0 {
 ; SI-LABEL: v_rsq_clamp_fabs_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f32_e64 v0, |v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_fabs_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f32_e64 v0, |v0|
 ; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
 ; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
@@ -42,19 +42,19 @@ define float @v_rsq_clamp_fabs_f32(float %src) #0 {
 define double @v_rsq_clamp_f64(double %src) #0 {
 ; SI-LABEL: v_rsq_clamp_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
 ; VI-NEXT:    s_mov_b32 s4, -1
 ; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
 ; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s5, 0xffefffff
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   ret double %rsq_clamp
@@ -63,19 +63,19 @@ define double @v_rsq_clamp_f64(double %src) #0 {
 define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 ; SI-LABEL: v_rsq_clamp_fabs_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_fabs_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
 ; VI-NEXT:    s_mov_b32 s4, -1
 ; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
 ; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s5, 0xffefffff
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
@@ -85,16 +85,16 @@ define double @v_rsq_clamp_fabs_f64(double %src) #0 {
 define float @v_rsq_clamp_undef_f32() #0 {
 ; SI-LABEL: v_rsq_clamp_undef_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f32_e32 v0, s4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_undef_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f32_e32 v0, s4
 ; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
 ; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
   ret float %rsq_clamp
@@ -103,19 +103,19 @@ define float @v_rsq_clamp_undef_f32() #0 {
 define double @v_rsq_clamp_undef_f64() #0 {
 ; SI-LABEL: v_rsq_clamp_undef_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], s[4:5]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_undef_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s4, -1
 ; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
 ; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s5, 0xffefffff
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
   ret double %rsq_clamp
@@ -124,16 +124,16 @@ define double @v_rsq_clamp_undef_f64() #0 {
 define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
 ; SI-LABEL: v_rsq_clamp_f32_non_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_f32_non_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f32_e32 v0, v0
 ; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
 ; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
   ret float %rsq_clamp
@@ -142,19 +142,19 @@ define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
 define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
 ; SI-LABEL: v_rsq_clamp_f64_non_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rsq_clamp_f64_non_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
 ; VI-NEXT:    s_mov_b32 s4, -1
 ; VI-NEXT:    s_mov_b32 s5, 0x7fefffff
 ; VI-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    s_mov_b32 s5, 0xffefffff
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   ret double %rsq_clamp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
index 70d51faa51c0dea..0bf0d797d5ed093 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll
@@ -1437,39 +1437,39 @@ define amdgpu_kernel void @test_setreg_set_4_bits_straddles_round_and_denorm() {
 define void @test_setreg_roundingmode_var_vgpr(i32 %var.mode) {
 ; GFX6-LABEL: test_setreg_roundingmode_var_vgpr:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e]
 ; GFX6-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x84,0xb9]
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX789-LABEL: test_setreg_roundingmode_var_vgpr:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX789-NEXT:    v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e]
 ; GFX789-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x04,0xb9]
 ; GFX789-NEXT:    ;;#ASMSTART
 ; GFX789-NEXT:    ;;#ASMEND
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX789-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; GFX10-LABEL: test_setreg_roundingmode_var_vgpr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_readfirstlane_b32 s4, v0 ; encoding: [0x00,0x05,0x08,0x7e]
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x84,0xb9]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: test_setreg_roundingmode_var_vgpr:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e]
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
   call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode)
   call void asm sideeffect "", ""()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 6eed92ba1d71ccc..1855b32b1f4980b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -4,8 +4,8 @@
 define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
 ; GFX6-LABEL: v_bfe_i32_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2)
   ret i32 %bfe_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index 7ee4e8be754e29f..28d0a2895565010 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -7,21 +7,21 @@
 define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -30,20 +30,20 @@ define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_clamp:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_clamp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
   ret i32 %r
@@ -77,21 +77,21 @@ define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inr
 define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v1, 0x40004, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -100,21 +100,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v1, 0x40004, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -123,22 +123,22 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0x40004
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v0, 0x80008, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -147,22 +147,22 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 define i32 @v_sdot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0x40004
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 8
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v0, 0x80008, v1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
   ret i32 %r
@@ -171,22 +171,22 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
 define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
 ; GFX906-LABEL: v_sdot2_inline_literal_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_inline_literal_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v2, 7
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_inline_literal_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
   ret i32 %r
@@ -195,22 +195,22 @@ define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
 define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_fneg_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fneg_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fneg_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
@@ -221,22 +221,22 @@ define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_fneg_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fneg_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fneg_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
@@ -247,24 +247,24 @@ define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
 define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 ; GFX906-LABEL: v_sdot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegf32_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegf32_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %cast.neg.c = bitcast float %neg.c to i32
@@ -275,24 +275,24 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
 ; GFX906-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -303,24 +303,24 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
 define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -330,24 +330,24 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_sdot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX908-NEXT:    v_dot2c_i32_i16_e32 v2, v0, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 6a79ad85a9a2876..7147acfdf50374e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -6,15 +6,15 @@
 define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot4c_i32_i8_e32 v2, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
   ret i32 %r
@@ -23,14 +23,14 @@ define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
 define i32 @v_sdot4_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_clamp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 true)
   ret i32 %r
@@ -40,7 +40,6 @@ define i32 @v_sdot4_clamp(i32 %a, i32 %b, i32 %c) {
 define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_cast_v4i8:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v10, 8
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -58,11 +57,11 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_cast_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
@@ -80,6 +79,7 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4c_i32_i8_e32 v8, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %a.cast = bitcast <4 x i8> %a to i32
   %b.cast = bitcast <4 x i8> %b to i32
@@ -90,17 +90,17 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_fnegf32_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegf32_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    v_dot4c_i32_i8_e32 v2, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32
@@ -111,17 +111,17 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot4_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    v_dot4c_i32_i8_e32 v2, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
index 88fdfff28961298..3c2b9c16371d9ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
@@ -6,14 +6,14 @@
 define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 false)
   ret i32 %r
@@ -22,14 +22,14 @@ define i32 @v_sdot8(i32 %a, i32 %b, i32 %c) {
 define i32 @v_sdot8_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_clamp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 true)
   ret i32 %r
@@ -46,16 +46,16 @@ define i32 @v_sdot8_clamp(i32 %a, i32 %b, i32 %c) {
 define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8_fnegf32_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegf32_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32
@@ -66,16 +66,16 @@ define i32 @v_sdot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 define i32 @v_sdot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX906-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sdot8_fnegv2f16_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    v_dot8_i32_i4 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
index 81fafa084eff9be..b57439f6954c84a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll
@@ -6,8 +6,8 @@ declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1
 define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -17,8 +17,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -28,8 +28,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -39,8 +39,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -52,8 +52,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -63,8 +63,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
@@ -74,8 +74,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -85,8 +85,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
index c66a4194253e9c0..92191ee991e399b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll
@@ -6,8 +6,8 @@ declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1
 define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -17,8 +17,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -28,8 +28,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -39,8 +39,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -52,8 +52,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -63,8 +63,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
@@ -74,8 +74,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -85,8 +85,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index 51e1fee0db249e9..69e35fa071c42f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -8,14 +8,14 @@
 define double @v_trig_preop_f64(double %a, i32 %b) {
 ; GCN-LABEL: v_trig_preop_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trig_preop_f64 v[0:1], v[0:1], v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_trig_preop_f64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_trig_preop_f64 v[0:1], v[0:1], v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b)
   ret double %result
@@ -24,14 +24,14 @@ define double @v_trig_preop_f64(double %a, i32 %b) {
 define double @v_trig_preop_f64_imm(double %a, i32 %b) {
 ; GCN-LABEL: v_trig_preop_f64_imm:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trig_preop_f64 v[0:1], v[0:1], 7
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_trig_preop_f64_imm:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_trig_preop_f64 v[0:1], v[0:1], 7
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7)
   ret double %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
index 43a0f018dc1cd22..d1ae9c02667b163 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -4,8 +4,8 @@
 define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
 ; GFX6-LABEL: v_bfe_i32_arg_arg_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %bfe_i32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2)
   ret i32 %bfe_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
index 20e054d1eec0142..7f067500e68bb5d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
@@ -7,20 +7,20 @@
 define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -29,20 +29,20 @@ define i32 @v_udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_udot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_clamp:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_clamp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
   ret i32 %r
@@ -76,20 +76,20 @@ define amdgpu_ps float @v_udot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inr
 define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
   ret i32 %r
@@ -98,20 +98,20 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
 define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -120,20 +120,20 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
 define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
   ret i32 %r
@@ -142,20 +142,20 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
 define i32 @v_udot2_inline_literal_a_b_c() {
 ; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
   ret i32 %r
@@ -164,20 +164,20 @@ define i32 @v_udot2_inline_literal_a_b_c() {
 define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
 ; GFX906-LABEL: v_udot2_inline_literal_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_inline_literal_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_inline_literal_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, 7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
   ret i32 %r
@@ -186,20 +186,20 @@ define i32 @v_udot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
 define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_fneg_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fneg_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fneg_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
@@ -210,20 +210,20 @@ define i32 @v_udot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_fneg_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fneg_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fneg_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
@@ -234,23 +234,23 @@ define i32 @v_udot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
 define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 ; GFX906-LABEL: v_udot2_fnegf32_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegf32_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegf32_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %cast.neg.c = bitcast float %neg.c to i32
@@ -261,23 +261,23 @@ define i32 @v_udot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
 define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
 ; GFX906-LABEL: v_udot2_fnegv2f16_c:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_fnegv2f16_c:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_fnegv2f16_c:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg <2 x half> %c
   %cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -288,23 +288,23 @@ define i32 @v_udot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
 define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_shuffle10_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_a:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_a:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -314,23 +314,23 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
 ; GFX906-LABEL: v_udot2_shuffle10_b:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX906-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: v_udot2_shuffle10_b:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX908-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot2_shuffle10_b:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
 ; GFX10-NEXT:    v_dot2_u32_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
   %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index 3e2a78d8fef8184..1a40c6b7ca4f961 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -7,14 +7,14 @@
 define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot4:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 false)
   ret i32 %r
@@ -23,14 +23,14 @@ define i32 @v_udot4(i32 %a, i32 %b, i32 %c) {
 define i32 @v_udot4_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot4_clamp:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 true)
   ret i32 %r
@@ -40,7 +40,6 @@ define i32 @v_udot4_clamp(i32 %a, i32 %b, i32 %c) {
 define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_cast_v4i8:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v10, 8
 ; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -58,11 +57,11 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_udot4_cast_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
@@ -79,11 +78,11 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_udot4_cast_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -101,6 +100,7 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX11-NEXT:    v_or3_b32 v1, v4, v1, v5
 ; GFX11-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.cast = bitcast <4 x i8> %a to i32
   %b.cast = bitcast <4 x i8> %b to i32
@@ -111,16 +111,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_fnegf32_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot4_fnegf32_a:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10PLUS-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32
@@ -131,16 +131,16 @@ define i32 @v_udot4_fnegf32_a(float %a, i32 %b, i32 %c) {
 define i32 @v_udot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot4_fnegv2f16_a:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10PLUS-NEXT:    v_dot4_u32_u8 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
index 4f6fe0f797f4752..73616751c780f04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
@@ -7,14 +7,14 @@
 define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 false)
   ret i32 %r
@@ -23,14 +23,14 @@ define i32 @v_udot8(i32 %a, i32 %b, i32 %c) {
 define i32 @v_udot8_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8_clamp:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot8_clamp:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 true)
   ret i32 %r
@@ -47,16 +47,16 @@ define i32 @v_udot8_clamp(i32 %a, i32 %b, i32 %c) {
 define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8_fnegf32_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot8_fnegf32_a:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX10PLUS-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg float %a
   %cast.neg.a = bitcast float %neg.a to i32
@@ -67,16 +67,16 @@ define i32 @v_udot8_fnegf32_a(float %a, i32 %b, i32 %c) {
 define i32 @v_udot8_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
 ; GFX906-LABEL: v_udot8_fnegv2f16_a:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX906-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_udot8_fnegv2f16_a:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10PLUS-NEXT:    v_dot8_u32_u4 v0, v0, v1, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
index b169063d67872df..75d5defb77b7c5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll
@@ -6,7 +6,6 @@
 define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX7-LABEL: v_powi_f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
@@ -21,11 +20,11 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_powi_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX8-NEXT:    v_log_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -34,11 +33,11 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -50,6 +49,7 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    v_exp_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %l.cast = bitcast i16 %l to half
   %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r)
@@ -60,7 +60,6 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 define float @v_powi_f32(float %l, i32 %r) {
 ; GFX78-LABEL: v_powi_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -81,11 +80,11 @@ define float @v_powi_f32(float %l, i32 %r) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
@@ -105,6 +104,7 @@ define float @v_powi_f32(float %l, i32 %r) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 %r)
   ret float %res
@@ -113,14 +113,14 @@ define float @v_powi_f32(float %l, i32 %r) {
 define float @v_powi_0_f32(float %l) {
 ; GFX78-LABEL: v_powi_0_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_0_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 0)
   ret float %res
@@ -143,7 +143,6 @@ define float @v_powi_1_f32(float %l) {
 define float @v_powi_neg1_f32(float %l) {
 ; GFX78-LABEL: v_powi_neg1_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -163,11 +162,11 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg1_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -186,6 +185,7 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -1)
   ret float %res
@@ -194,7 +194,6 @@ define float @v_powi_neg1_f32(float %l) {
 define float @v_powi_2_f32(float %l) {
 ; GFX78-LABEL: v_powi_2_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -214,11 +213,11 @@ define float @v_powi_2_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_2_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -237,6 +236,7 @@ define float @v_powi_2_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 2)
   ret float %res
@@ -245,7 +245,6 @@ define float @v_powi_2_f32(float %l) {
 define float @v_powi_neg2_f32(float %l) {
 ; GFX78-LABEL: v_powi_neg2_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -265,11 +264,11 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg2_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -288,6 +287,7 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -2)
   ret float %res
@@ -296,7 +296,6 @@ define float @v_powi_neg2_f32(float %l) {
 define float @v_powi_4_f32(float %l) {
 ; GFX78-LABEL: v_powi_4_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -316,11 +315,11 @@ define float @v_powi_4_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_4_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -339,6 +338,7 @@ define float @v_powi_4_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 4)
   ret float %res
@@ -347,7 +347,6 @@ define float @v_powi_4_f32(float %l) {
 define float @v_powi_8_f32(float %l) {
 ; GFX78-LABEL: v_powi_8_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -367,11 +366,11 @@ define float @v_powi_8_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_8_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -390,6 +389,7 @@ define float @v_powi_8_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 8)
   ret float %res
@@ -398,7 +398,6 @@ define float @v_powi_8_f32(float %l) {
 define float @v_powi_16_f32(float %l) {
 ; GFX78-LABEL: v_powi_16_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -418,11 +417,11 @@ define float @v_powi_16_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -441,6 +440,7 @@ define float @v_powi_16_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 16)
   ret float %res
@@ -449,7 +449,6 @@ define float @v_powi_16_f32(float %l) {
 define float @v_powi_128_f32(float %l) {
 ; GFX78-LABEL: v_powi_128_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -469,11 +468,11 @@ define float @v_powi_128_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_128_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -492,6 +491,7 @@ define float @v_powi_128_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 128)
   ret float %res
@@ -500,7 +500,6 @@ define float @v_powi_128_f32(float %l) {
 define float @v_powi_neg128_f32(float %l) {
 ; GFX78-LABEL: v_powi_neg128_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX78-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX78-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -520,11 +519,11 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX78-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GFX78-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg128_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -543,6 +542,7 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -128)
   ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index c28d204784d4b2a..d1357ce4fe6e1bb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -9,14 +9,12 @@
 define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v3, v[0:1], off offset:1
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
@@ -50,21 +48,20 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
 ; GFX9-NOUNALIGNED-NEXT:    v_or3_b32 v2, v7, v8, v6
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
 ; GFX7-NOUNALIGNED:       ; %bb.0:
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
@@ -110,11 +107,11 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v0, v2, v1
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v4, v3
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v6, v5
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v3i32_align1:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -160,6 +157,7 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v6, v5
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 1
   ret <3 x i32> %load
@@ -168,14 +166,12 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX9-UNALIGNED:       ; %bb.0:
-; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX9-NOUNALIGNED:       ; %bb.0:
-; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v3, v[0:1], off offset:2
 ; GFX9-NOUNALIGNED-NEXT:    global_load_ushort v4, v[0:1], off offset:4
@@ -188,21 +184,20 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
 ; GFX9-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
+; GFX9-NOUNALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s6, 0
 ; GFX7-UNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-UNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
 ; GFX7-NOUNALIGNED:       ; %bb.0:
-; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NOUNALIGNED-NEXT:    s_mov_b64 s[4:5], 0
@@ -224,11 +219,11 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX7-NOUNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX7-NOUNALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NOUNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v3i32_align2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -250,6 +245,7 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v6
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v7
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 2
   ret <3 x i32> %load
@@ -258,24 +254,21 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_v3i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_v3i32_align4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v3i32_align4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -284,7 +277,7 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4
   ret <3 x i32> %load
@@ -293,24 +286,21 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
 define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_i96_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_i96_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_i96_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -319,7 +309,7 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load i96, ptr addrspace(4) %ptr, align 8
   ret i96 %load
@@ -328,24 +318,21 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_v3i32_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_v3i32_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v3i32_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -354,7 +341,7 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(1)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 8
   ret <3 x i32> %load
@@ -363,14 +350,12 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
 define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_v6i16_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_v6i16_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -382,11 +367,11 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v7
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v8
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v6i16_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -399,6 +384,7 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v7
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <6 x i16>, ptr addrspace(4) %ptr, align 8
   ret <6 x i16> %load
@@ -407,7 +393,6 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
 define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_v12i8_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
@@ -423,11 +408,11 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v12
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_v12i8_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -446,11 +431,11 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v13
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v12
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v12i8_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -469,6 +454,7 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v11, 24, v8
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v12
 ; GFX6-NEXT:    v_mov_b32_e32 v4, v13
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <12 x i8>, ptr addrspace(4) %ptr, align 8
   ret <12 x i8> %load
@@ -477,29 +463,26 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
 define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: v_load_constant_v3i32_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_load_constant_v3i32_align16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: v_load_constant_v3i32_align16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
   ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
index fef672570352c37..1c7c5a71ad4e50d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll
@@ -10,31 +10,27 @@
 define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b128 v[0:3], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b128 v[0:3], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b128 v[0:3], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr
   ret <4 x i32> %load
@@ -43,7 +39,6 @@ define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -52,15 +47,12 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_or3_b32 v4, v2, v3, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v3, v1
@@ -85,18 +77,17 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
 ; GFX9-NEXT:    v_or3_b32 v3, v0, v5, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
 ; GFX7-NEXT:    ds_read_u8 v2, v0
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
@@ -141,11 +132,11 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -162,39 +153,28 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:15
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:14
-; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
 ; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
 ; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
 ; GFX10-NEXT:    v_or3_b32 v3, v11, v12, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
 ; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
@@ -211,35 +191,25 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
 ; GFX11-NEXT:    ds_load_u8 v15, v0 offset:15
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:14
-; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(13)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(9)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; GFX11-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 24, v15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
 ; GFX11-NEXT:    v_or3_b32 v0, v2, v3, v1
 ; GFX11-NEXT:    v_or3_b32 v1, v5, v6, v4
 ; GFX11-NEXT:    v_or3_b32 v2, v8, v9, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or3_b32 v3, v11, v12, v10
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
   ret <4 x i32> %load
@@ -248,7 +218,6 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16 v1, v0
 ; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -257,19 +226,16 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
@@ -279,22 +245,20 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX7-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v1, v0
 ; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -303,19 +267,16 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v1, v0
 ; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
 ; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
@@ -324,14 +285,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
 ; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
 ; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 2
   ret <4 x i32> %load
@@ -340,39 +299,35 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
 ; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 4
   ret <4 x i32> %load
@@ -381,33 +336,29 @@ define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 8
   ret <4 x i32> %load
@@ -416,31 +367,27 @@ define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b128 v[0:3], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b128 v[0:3], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b128 v[0:3], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 16
   ret <4 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
index 225f2165977b3c8..25c99d7869ce6df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll
@@ -10,31 +10,27 @@
 define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b96 v[0:2], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b96 v[0:2], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b96 v[0:2], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr
   ret <3 x i32> %load
@@ -43,7 +39,6 @@ define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -52,19 +47,16 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
 ; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
 ; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_or3_b32 v3, v2, v3, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:8
 ; GFX9-NEXT:    ds_read_u8 v4, v0 offset:9
 ; GFX9-NEXT:    ds_read_u8 v5, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
@@ -76,18 +68,17 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    v_or3_b32 v1, v6, v7, v1
 ; GFX9-NEXT:    v_or3_b32 v2, v0, v4, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
 ; GFX7-NEXT:    ds_read_u8 v2, v0
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
@@ -120,11 +111,11 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -137,32 +128,24 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:11
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:10
-; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
 ; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
 ; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
 ; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
@@ -175,28 +158,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
 ; GFX11-NEXT:    ds_load_u8 v11, v0 offset:11
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:10
-; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(9)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; GFX11-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    v_or3_b32 v0, v2, v3, v1
 ; GFX11-NEXT:    v_or3_b32 v1, v5, v6, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or3_b32 v2, v8, v9, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
   ret <3 x i32> %load
@@ -205,24 +181,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16 v1, v0
 ; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u16 v1, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:2
@@ -230,49 +203,44 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX7-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v1, v0
 ; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v1, v0
 ; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
 ; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
 ; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
 ; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
 ; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 2
   ret <3 x i32> %load
@@ -281,39 +249,35 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 4
   ret <3 x i32> %load
@@ -322,39 +286,35 @@ define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 8
   ret <3 x i32> %load
@@ -363,31 +323,27 @@ define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b96 v[0:2], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b96 v[0:2], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b96 v[0:2], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 16
   ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
index 06bf71f5e122cc9..7a1a0957eef5e82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll
@@ -11,21 +11,18 @@
 define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b128 v[0:3], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
 ; GFX7-NEXT:    ds_read_u8 v2, v0
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
@@ -70,22 +67,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
   ret <4 x i32> %load
@@ -94,21 +90,18 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b96 v[0:2], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
 ; GFX7-NEXT:    ds_read_u8 v2, v0
 ; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
@@ -141,22 +134,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
   ret <3 x i32> %load
@@ -165,14 +157,12 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 define void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX9-LABEL: store_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b128 v0, v[1:4]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: store_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX7-NEXT:    v_bfe_u32 v6, v1, 8, 8
@@ -202,22 +192,20 @@ define void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
 ; GFX7-NEXT:    ds_write_b8 v0, v2 offset:15
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_lds_v4i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_lds_v4i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_store_b128 v0, v[1:4]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i32> %x, ptr addrspace(3) %out, align 1
   ret void
@@ -226,14 +214,12 @@ define void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i32> %x) {
 define void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX9-LABEL: store_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: store_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
@@ -256,22 +242,20 @@ define void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i32> %x) {
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_lds_v3i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX10-NEXT:    ds_write_b32 v0, v3 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_lds_v3i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_store_b96 v0, v[1:3]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i32> %x, ptr addrspace(3) %out, align 1
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index da9601a8998c2ba..aa1b7b43b57a933 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -167,10 +167,10 @@ bb2:
 define void @localize_internal_globals(i1 %cond) {
 ; GFX9-LABEL: localize_internal_globals:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_3
@@ -179,6 +179,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    s_cbranch_execnz .LBB2_4
 ; GFX9-NEXT:  .LBB2_2: ; %bb2
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ; GFX9-NEXT:  .LBB2_3: ; %bb1
 ; GFX9-NEXT:    s_getpc_b64 s[6:7]
@@ -209,6 +210,7 @@ define void @localize_internal_globals(i1 %cond) {
 ; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br i1 %cond, label %bb0, label %bb1
@@ -231,9 +233,9 @@ bb2:
 define void @sink_null_insert_pt(ptr addrspace(4) %arg0) {
 ; GFX9-LABEL: sink_null_insert_pt:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s16, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 980ba3da4bac7ae..eb550514a86bb4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -8,30 +8,30 @@
 define i8 @v_lshr_i8(i8 %value, i8 %amount) {
 ; GFX6-LABEL: v_lshr_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i8 %value, %amount
   ret i8 %result
@@ -40,29 +40,29 @@ define i8 @v_lshr_i8(i8 %value, i8 %amount) {
 define i8 @v_lshr_i8_7(i8 %value) {
 ; GFX6-LABEL: v_lshr_i8_7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 7, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i8_7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 7
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i8_7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 7
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 7, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -102,18 +102,18 @@ define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
 define i24 @v_lshr_i24(i24 %value, i24 %amount) {
 ; GCN-LABEL: v_lshr_i24:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i24 %value, %amount
   ret i24 %result
@@ -122,14 +122,14 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
 define i24 @v_lshr_i24_7(i24 %value) {
 ; GCN-LABEL: v_lshr_i24_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 17
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i24_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_u32 v0, v0, 7, 17
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i24 %value, 7
   ret i24 %result
@@ -168,14 +168,14 @@ define amdgpu_ps i24 @s_lshr_i24_7(i24 inreg %value) {
 define i32 @v_lshr_i32(i32 %value, i32 %amount) {
 ; GCN-LABEL: v_lshr_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i32 %value, %amount
   ret i32 %result
@@ -184,14 +184,14 @@ define i32 @v_lshr_i32(i32 %value, i32 %amount) {
 define i32 @v_lshr_i32_31(i32 %value) {
 ; GCN-LABEL: v_lshr_i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i32 %value, 31
   ret i32 %result
@@ -268,16 +268,16 @@ define amdgpu_ps float @lshr_i32_vs(i32 %value, i32 inreg %amount) {
 define <2 x i32> @v_lshr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 ; GCN-LABEL: v_lshr_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i32> %value, %amount
   ret <2 x i32> %result
@@ -286,16 +286,16 @@ define <2 x i32> @v_lshr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 define <2 x i32> @v_lshr_v2i32_31(<2 x i32> %value) {
 ; GCN-LABEL: v_lshr_v2i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i32> %value, <i32 31, i32 31>
   ret <2 x i32> %result
@@ -320,18 +320,18 @@ define amdgpu_ps <2 x i32> @s_lshr_v2i32(<2 x i32> inreg %value, <2 x i32> inreg
 define <3 x i32> @v_lshr_v3i32(<3 x i32> %value, <3 x i32> %amount) {
 ; GCN-LABEL: v_lshr_v3i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <3 x i32> %value, %amount
   ret <3 x i32> %result
@@ -358,20 +358,20 @@ define amdgpu_ps <3 x i32> @s_lshr_v3i32(<3 x i32> inreg %value, <3 x i32> inreg
 define <4 x i32> @v_lshr_v4i32(<4 x i32> %value, <4 x i32> %amount) {
 ; GCN-LABEL: v_lshr_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <4 x i32> %value, %amount
   ret <4 x i32> %result
@@ -400,22 +400,22 @@ define amdgpu_ps <4 x i32> @s_lshr_v4i32(<4 x i32> inreg %value, <4 x i32> inreg
 define <5 x i32> @v_lshr_v5i32(<5 x i32> %value, <5 x i32> %amount) {
 ; GCN-LABEL: v_lshr_v5i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <5 x i32> %value, %amount
   ret <5 x i32> %result
@@ -446,7 +446,6 @@ define amdgpu_ps <5 x i32> @s_lshr_v5i32(<5 x i32> inreg %value, <5 x i32> inreg
 define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-LABEL: v_lshr_v16i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
@@ -465,11 +464,11 @@ define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v15, v16, v15
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
@@ -488,11 +487,11 @@ define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
@@ -511,6 +510,7 @@ define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <16 x i32> %value, %amount
   ret <16 x i32> %result
@@ -563,28 +563,28 @@ define amdgpu_ps <16 x i32> @s_lshr_v16i32(<16 x i32> inreg %value, <16 x i32> i
 define i16 @v_lshr_i16(i16 %value, i16 %amount) {
 ; GFX6-LABEL: v_lshr_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i16 %value, %amount
   ret i16 %result
@@ -593,26 +593,26 @@ define i16 @v_lshr_i16(i16 %value, i16 %amount) {
 define i16 @v_lshr_i16_15(i16 %value) {
 ; GFX6-LABEL: v_lshr_i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 15, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i16 %value, 15
   ret i16 %result
@@ -705,33 +705,33 @@ define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
 define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v1, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i16> %value, %amount
   ret <2 x i16> %result
@@ -740,30 +740,30 @@ define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_lshr_v2i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 15, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v2i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result
@@ -902,7 +902,6 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
@@ -919,31 +918,32 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v2, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v3, v1
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <4 x i16> %value, %amount
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -1049,7 +1049,6 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_lshr_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v8, v0
@@ -1082,11 +1081,11 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v8, v4, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v1
@@ -1099,24 +1098,25 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v5, v1
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <8 x i16> %value, %amount
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -1251,26 +1251,26 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 define i64 @v_lshr_i64(i64 %value, i64 %amount) {
 ; GFX6-LABEL: v_lshr_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i64 %value, %amount
   ret i64 %result
@@ -1279,16 +1279,16 @@ define i64 @v_lshr_i64(i64 %value, i64 %amount) {
 define i64 @v_lshr_i64_63(i64 %value) {
 ; GCN-LABEL: v_lshr_i64_63:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i64_63:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i64 %value, 63
   ret i64 %result
@@ -1297,16 +1297,16 @@ define i64 @v_lshr_i64_63(i64 %value) {
 define i64 @v_lshr_i64_33(i64 %value) {
 ; GCN-LABEL: v_lshr_i64_33:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i64_33:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 1, v1
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i64 %value, 33
   ret i64 %result
@@ -1315,22 +1315,22 @@ define i64 @v_lshr_i64_33(i64 %value) {
 define i64 @v_lshr_i64_32(i64 %value) {
 ; GCN-LABEL: v_lshr_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i64_32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i64_32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i64 %value, 32
   ret i64 %result
@@ -1339,26 +1339,26 @@ define i64 @v_lshr_i64_32(i64 %value) {
 define i64 @v_lshr_i64_31(i64 %value) {
 ; GFX6-LABEL: v_lshr_i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i64 %value, 31
   ret i64 %result
@@ -1493,30 +1493,30 @@ define amdgpu_ps <2 x float> @lshr_i64_vs(i64 %value, i64 inreg %amount) {
 define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX6-LABEL: v_lshr_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], v4
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i64> %value, %amount
   ret <2 x i64> %result
@@ -1525,30 +1525,30 @@ define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 define <2 x i64> @v_lshr_v2i64_31(<2 x i64> %value) {
 ; GFX6-LABEL: v_lshr_v2i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 31
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_v2i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_v2i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_lshr_v2i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr <2 x i64> %value, <i64 31, i64 31>
   ret <2 x i64> %result
@@ -1573,7 +1573,6 @@ define amdgpu_ps <2 x i64> @s_lshr_v2i64(<2 x i64> inreg %value, <2 x i64> inreg
 define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX6-LABEL: v_lshr_i65:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v3
@@ -1591,11 +1590,11 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i65:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v3
@@ -1613,11 +1612,11 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i65:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v3
@@ -1635,11 +1634,11 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i65:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
@@ -1657,11 +1656,11 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i65:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v10, 64, v3
@@ -1678,6 +1677,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i65 %value, %amount
   ret i65 %result
@@ -1686,7 +1686,6 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) {
 define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX6-LABEL: v_lshr_i65_33:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
@@ -1694,11 +1693,11 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_lshr_i65_33:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -1706,11 +1705,11 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_lshr_i65_33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -1718,11 +1717,11 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_lshr_i65_33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
@@ -1730,17 +1729,18 @@ define i65 @v_lshr_i65_33(i65 %value) {
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_lshr_i65_33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
 ; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = lshr i65 %value, 33
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
index 0f84f12a12574d6..47baa9ea6c91535 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
@@ -14,8 +14,8 @@ declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
 define i32 @test_umin_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_umin_i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_min_u32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   ret i32 %r
@@ -24,8 +24,8 @@ define i32 @test_umin_i32(i32 %a, i32 %b) {
 define i32 @test_umax_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_umax_i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_max_u32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   ret i32 %r
@@ -34,8 +34,8 @@ define i32 @test_umax_i32(i32 %a, i32 %b) {
 define i32 @test_smin_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_smin_i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_min_i32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   ret i32 %r
@@ -44,8 +44,8 @@ define i32 @test_smin_i32(i32 %a, i32 %b) {
 define i32 @test_smax_i32(i32 %a, i32 %b) {
 ; CHECK-LABEL: test_smax_i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_max_i32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   ret i32 %r
@@ -54,11 +54,11 @@ define i32 @test_smax_i32(i32 %a, i32 %b) {
 define <4 x i32> @test_umin_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_umin_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_min_u32_e32 v0, v0, v4
 ; CHECK-NEXT:    v_min_u32_e32 v1, v1, v5
 ; CHECK-NEXT:    v_min_u32_e32 v2, v2, v6
 ; CHECK-NEXT:    v_min_u32_e32 v3, v3, v7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -67,11 +67,11 @@ define <4 x i32> @test_umin_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test_umax_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_umax_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_max_u32_e32 v0, v0, v4
 ; CHECK-NEXT:    v_max_u32_e32 v1, v1, v5
 ; CHECK-NEXT:    v_max_u32_e32 v2, v2, v6
 ; CHECK-NEXT:    v_max_u32_e32 v3, v3, v7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -80,11 +80,11 @@ define <4 x i32> @test_umax_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test_smin_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_smin_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_min_i32_e32 v0, v0, v4
 ; CHECK-NEXT:    v_min_i32_e32 v1, v1, v5
 ; CHECK-NEXT:    v_min_i32_e32 v2, v2, v6
 ; CHECK-NEXT:    v_min_i32_e32 v3, v3, v7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
@@ -93,11 +93,11 @@ define <4 x i32> @test_smin_v4i32(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i32> @test_smax_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_smax_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_max_i32_e32 v0, v0, v4
 ; CHECK-NEXT:    v_max_i32_e32 v1, v1, v5
 ; CHECK-NEXT:    v_max_i32_e32 v2, v2, v6
 ; CHECK-NEXT:    v_max_i32_e32 v3, v3, v7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %r
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index eb3f74be71de017..4016ae120ae29af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -38,28 +38,28 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
 define i16 @v_mul_i16(i16 %num, i16 %den) {
 ; GFX7-LABEL: v_mul_i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_mul_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
@@ -102,28 +102,28 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 ; GFX7-LABEL: v_mul_i16_zeroext:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i16_zeroext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_mul_i16_zeroext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
@@ -166,32 +166,32 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
 define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
 ; GFX7-LABEL: v_mul_i16_signext:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i16_signext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i16_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_mul_i16_signext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i16 %num, %den
   ret i16 %result
@@ -214,14 +214,14 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
 define i32 @v_mul_i32(i32 %num, i32 %den) {
 ; GCN-LABEL: v_mul_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_mul_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i32 %num, %den
   ret i32 %result
@@ -246,16 +246,16 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
 define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GCN-LABEL: v_mul_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_mul_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = mul <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -366,33 +366,33 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
 define i64 @v_mul_i64(i64 %num, i64 %den) {
 ; GCN-LABEL: v_mul_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, v1
 ; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
 ; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v5, v2
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
 ; GFX11-NEXT:    v_mul_lo_u32 v2, v5, v2
 ; GFX11-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i64 %num, %den
   ret i64 %result
@@ -498,7 +498,6 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GCN-LABEL: v_mul_i96:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v6, v0
 ; GCN-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
@@ -508,11 +507,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, v8
 ; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
 ; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i96:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v3
@@ -522,11 +521,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX10-NEXT:    v_add3_u32 v2, v5, v8, v2
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i96:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
 ; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v3
 ; GFX11-NEXT:    v_mul_lo_u32 v5, v6, v5
@@ -535,6 +534,7 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
 ; GFX11-NEXT:    v_add3_u32 v2, v5, v8, v2
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i96 %num, %den
   ret i96 %result
@@ -717,7 +717,6 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX7-LABEL: v_mul_i128:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
@@ -734,11 +733,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
 ; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
 ; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
@@ -755,11 +754,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
 ; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
@@ -776,11 +775,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v10, v2
@@ -798,11 +797,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v3, v4, v5, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v10, v2
 ; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v4
@@ -819,6 +818,7 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
 ; GFX11-NEXT:    v_add3_u32 v3, v4, v5, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i128 %num, %den
   ret i128 %result
@@ -1633,7 +1633,6 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-LABEL: v_mul_i256:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v17, v1
@@ -1698,11 +1697,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX7-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
 ; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
 ; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_i256:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v17, v1
@@ -1767,11 +1766,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX8-NEXT:    v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
 ; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v9, v20, vcc
 ; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_i256:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v17, v1
@@ -1836,11 +1835,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], v9, v23, s[4:5]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v20, vcc
 ; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_i256:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v17, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
@@ -1907,11 +1906,11 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v8, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_i256:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
 ; GFX11-NEXT:    v_mul_lo_u32 v7, v7, v8
 ; GFX11-NEXT:    v_mul_lo_u32 v27, v6, v9
@@ -1977,6 +1976,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, v8, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = mul i256 %num, %den
   ret i256 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
index 1a474a352074213..dd6b5983e374d38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll
@@ -7,22 +7,22 @@
 define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; GFX9-LABEL: v_mul_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %a, %b
   ret <2 x i16> %mul
@@ -31,23 +31,23 @@ define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
 define <2 x i16> @v_mul_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-LABEL: v_mul_v2i16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_v2i16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_v2i16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
@@ -58,23 +58,23 @@ define <2 x i16> @v_mul_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 define <2 x i16> @v_mul_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_mul_v2i16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_v2i16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_v2i16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
@@ -85,24 +85,24 @@ define <2 x i16> @v_mul_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 define <2 x i16> @v_mul_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_v2i16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %neg.b = fneg <2 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index fec564f9b45dbce..89a19ba5c39f22f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -147,11 +147,11 @@ bb.1:
 define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
 ; GCN-LABEL: func_non_entry_block_static_alloca_align4:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, s33
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_addk_i32 s32, 0x400
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB2_3
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
@@ -179,6 +179,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 
 entry:
@@ -210,12 +211,12 @@ bb.2:
 define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
 ; GCN-LABEL: func_non_entry_block_static_alloca_align64:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, s33
 ; GCN-NEXT:    s_add_i32 s33, s32, 0xfc0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_and_b32 s33, s33, 0xfffff000
 ; GCN-NEXT:    s_addk_i32 s32, 0x2000
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB3_2
 ; GCN-NEXT:  ; %bb.1: ; %bb.0
@@ -240,6 +241,7 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_addk_i32 s32, 0xe000
 ; GCN-NEXT:    s_mov_b32 s33, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cond = icmp eq i32 %arg.cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06cd1..364a9bf0f5344c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -98,16 +98,16 @@ define amdgpu_ps { i32, i32 } @s_orn2_i32_multi_foldable_use(i32 inreg %src0, i3
 define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
 ; GCN-LABEL: v_orn2_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v1, v1
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i32 %src1, -1
   %or = or i32 %src0, %not.src1
@@ -246,20 +246,20 @@ define amdgpu_ps { i64, i64 } @s_orn2_i64_multi_use(i64 inreg %src0, i64 inreg %
 define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
 ; GCN-LABEL: v_orn2_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-NEXT:    v_not_b32_e32 v3, v3
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v2, v2
 ; GFX10PLUS-NEXT:    v_not_b32_e32 v3, v3
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i64 %src1, -1
   %or = or i64 %src0, %not.src1
@@ -442,16 +442,16 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i1
 define i16 @v_orn2_i16(i16 %src0, i16 %src1) {
 ; GCN-LABEL: v_orn2_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor i16 %src1, -1
   %or = or i16 %src0, %not.src1
@@ -654,7 +654,6 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %
 define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -664,20 +663,21 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_orn2_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
   %or = or <2 x i16> %src0, %not.src1
@@ -864,7 +864,6 @@ define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -880,24 +879,25 @@ define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_orn2_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -11, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_v3i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -11, v3
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -11>
   %or = or <3 x i16> %src0, %not.src1
@@ -1097,7 +1097,6 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %
 define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -1116,24 +1115,25 @@ define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
 ; GFX6-NEXT:    v_or_b32_e32 v2, v1, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_orn2_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_orn2_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX10PLUS-NEXT:    v_xor_b32_e32 v3, -1, v3
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
   %or = or <4 x i16> %src0, %not.src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0ea89d98a..5871e3f2ed95dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -8,7 +8,6 @@
 define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_saddsat_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
@@ -19,11 +18,11 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
@@ -34,24 +33,25 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
 ; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
@@ -118,7 +118,6 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_saddsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
@@ -129,11 +128,11 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -144,24 +143,25 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -228,7 +228,6 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-LABEL: v_saddsat_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
@@ -255,11 +254,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
@@ -283,11 +282,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -301,11 +300,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -319,11 +318,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -339,6 +338,7 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -499,7 +499,6 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-LABEL: v_saddsat_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
@@ -558,11 +557,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
@@ -613,11 +612,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
@@ -645,11 +644,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -676,11 +675,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -707,6 +706,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1000,7 +1000,6 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-LABEL: v_saddsat_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
@@ -1011,11 +1010,11 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -1026,24 +1025,25 @@ define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
@@ -1107,7 +1107,6 @@ define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_saddsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
@@ -1115,11 +1114,11 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v2, 0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x80000000, v3
@@ -1127,18 +1126,19 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -1260,7 +1260,6 @@ define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
@@ -1275,11 +1274,11 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
@@ -1294,20 +1293,21 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1374,7 +1374,6 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
@@ -1397,11 +1396,11 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v8, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
@@ -1424,22 +1423,23 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_add_i32 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_add_i32 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -1525,7 +1525,6 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
@@ -1556,11 +1555,11 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v10, 0, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
@@ -1591,24 +1590,25 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_add_i32 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_add_i32 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_add_i32 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -1713,7 +1713,6 @@ define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
@@ -1751,11 +1750,11 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v12, 0, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX8-NEXT:    v_max_i32_e32 v10, 0, v0
@@ -1793,26 +1792,27 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v5i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v5 clamp
 ; GFX9-NEXT:    v_add_i32 v1, v1, v6 clamp
 ; GFX9-NEXT:    v_add_i32 v2, v2, v7 clamp
 ; GFX9-NEXT:    v_add_i32 v3, v3, v8 clamp
 ; GFX9-NEXT:    v_add_i32 v4, v4, v9 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v5 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v6 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v2, v2, v7 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v3, v3, v8 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v4, v4, v9 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
   ret <5 x i32> %result
@@ -1936,7 +1936,6 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_i32_e32 v32, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v31, 1
 ; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v31, v32
@@ -2053,11 +2052,11 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i32_e32 v32, 0, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v31, 1
 ; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v31, v32
@@ -2174,11 +2173,11 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v16
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_add_i32 v1, v1, v17 clamp
@@ -2197,11 +2196,11 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_add_i32 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_add_nc_i32 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_add_nc_i32 v1, v1, v17 clamp
@@ -2220,11 +2219,11 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_add_nc_i32 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_i32 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_add_nc_i32 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_add_nc_i32 v1, v1, v17 clamp
@@ -2243,6 +2242,7 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_add_nc_i32 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_i32 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -2575,7 +2575,6 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_saddsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2586,11 +2585,11 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
@@ -2598,18 +2597,19 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -2748,7 +2748,6 @@ define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -2769,11 +2768,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v4, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_max_i16_e32 v3, 0, v0
@@ -2790,18 +2789,19 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v3
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -3034,7 +3034,6 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
@@ -3085,11 +3084,11 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
@@ -3122,20 +3121,21 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_add_u16_e32 v1, v1, v7
 ; GFX8-NEXT:    v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -3296,7 +3296,6 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v6i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v14, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
@@ -3371,11 +3370,11 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v10, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_max_i16_e32 v9, 0, v0
@@ -3424,22 +3423,23 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v11
 ; GFX8-NEXT:    v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v6i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
   %cast = bitcast <6 x i16> %result to <3 x float>
@@ -3648,7 +3648,6 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_min_i32_e32 v18, 0, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
@@ -3747,11 +3746,11 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v13, 0, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    v_max_i16_e32 v12, 0, v0
@@ -3816,24 +3815,25 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_add_u16_e32 v3, v3, v15
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -4101,7 +4101,6 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-LABEL: v_saddsat_i48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 16
@@ -4115,11 +4114,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 16
@@ -4133,11 +4132,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
@@ -4150,11 +4149,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_i48:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
@@ -4167,11 +4166,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_i48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
@@ -4183,6 +4182,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
@@ -4498,7 +4498,6 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_saddsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4508,11 +4507,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4522,11 +4521,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4536,11 +4535,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
@@ -4550,11 +4549,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
@@ -4563,6 +4562,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -4815,7 +4815,6 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4835,11 +4834,11 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4859,11 +4858,11 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4883,11 +4882,11 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
@@ -4906,11 +4905,11 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
@@ -4927,6 +4926,7 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -5645,7 +5645,6 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
@@ -5693,11 +5692,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v8
 ; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
@@ -5745,11 +5744,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v9, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v2, v10, vcc
@@ -5797,11 +5796,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v8
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
@@ -5848,11 +5847,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v8
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
@@ -5898,6 +5897,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll
index b1a36c7eaeedc37..57838600c90ff86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sbfx.ll
@@ -7,8 +7,8 @@
 define signext i8 @v_ashr_i8_i32(i32 %value) {
 ; GCN-LABEL: v_ashr_i8_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 4, 8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = ashr i32 %value, 4
  %2 = trunc i32 %1 to i8
@@ -18,8 +18,8 @@ define signext i8 @v_ashr_i8_i32(i32 %value) {
 define signext i16 @v_ashr_i16_i32(i32 %value) {
 ; GCN-LABEL: v_ashr_i16_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 9, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = ashr i32 %value, 9
  %2 = trunc i32 %1 to i16
@@ -29,8 +29,8 @@ define signext i16 @v_ashr_i16_i32(i32 %value) {
 define signext i8 @v_lshr_i8_i32(i32 %value) {
 ; GCN-LABEL: v_lshr_i8_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 4, 8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = lshr i32 %value, 4
  %2 = trunc i32 %1 to i8
@@ -40,8 +40,8 @@ define signext i8 @v_lshr_i8_i32(i32 %value) {
 define signext i16 @v_lshr_i16_i32(i32 %value) {
 ; GCN-LABEL: v_lshr_i16_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 9, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = lshr i32 %value, 9
  %2 = trunc i32 %1 to i16
@@ -52,10 +52,10 @@ define signext i16 @v_lshr_i16_i32(i32 %value) {
 define i64 @v_ashr_i64(i64 %value) {
 ; GCN-LABEL: v_ashr_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i64 v[0:1], 10, v[0:1]
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 4
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = ashr i64 %value, 10
  %2 = shl i64 %1, 60
@@ -66,10 +66,10 @@ define i64 @v_ashr_i64(i64 %value) {
 define i64 @v_lshr_i64(i64 %value) {
 ; GCN-LABEL: v_lshr_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i64 v[0:1], 10, v[0:1]
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 4
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = lshr i64 %value, 10
  %2 = shl i64 %1, 60
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 1061f0003bd4896..125c1d2a8898a84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -7,7 +7,6 @@
 define i32 @v_sdiv_i32(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_sdiv_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -36,11 +35,11 @@ define i32 @v_sdiv_i32(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; CGP-NEXT:    v_xor_b32_e32 v4, v2, v3
@@ -69,6 +68,7 @@ define i32 @v_sdiv_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i32 %num, %den
   ret i32 %result
@@ -151,7 +151,6 @@ define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) {
 define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_sdiv_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -208,11 +207,11 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -269,6 +268,7 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -277,7 +277,6 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_sdiv_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x45800000
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xfffff000
@@ -302,6 +301,7 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i32 %num, 4096
   ret i32 %result
@@ -310,7 +310,6 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
@@ -352,11 +351,11 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x45800000
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
@@ -397,6 +396,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
@@ -405,7 +405,6 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_sdiv_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xffed2705
@@ -430,6 +429,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i32 %num, 1235195
   ret i32 %result
@@ -438,7 +438,6 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_sdiv_v2i32_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
@@ -480,11 +479,11 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x4996c7d8
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
@@ -525,6 +524,7 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
@@ -533,7 +533,6 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-LABEL: v_sdiv_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
@@ -563,6 +562,7 @@ define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v2, v3
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
@@ -572,7 +572,6 @@ define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_sdiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
@@ -631,11 +630,11 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
@@ -694,6 +693,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -703,7 +703,6 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_sdiv_i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
@@ -734,11 +733,11 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -760,6 +759,7 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i32 %num, 16777215
   %den.mask = and i32 %den, 16777215
@@ -770,7 +770,6 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_sdiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -831,11 +830,11 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -878,6 +877,7 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
   %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 3eb6f1eced0957f..7d7e9ffb90023fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -7,13 +7,13 @@
 define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_sdiv_i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
@@ -22,6 +22,7 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
@@ -183,6 +184,7 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, %den
   ret i64 %result
@@ -377,7 +379,6 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_sdiv_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
@@ -643,11 +644,11 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v11, v1
 ; CGP-NEXT:    v_mov_b32_e32 v10, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
@@ -656,6 +657,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mov_b32_e32 v8, v2
 ; CGP-NEXT:    v_mov_b32_e32 v9, v3
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_2
@@ -831,6 +833,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB2_8
 ; CGP-NEXT:  .LBB2_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
@@ -990,6 +993,7 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -998,7 +1002,6 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0xfffff000
@@ -1119,6 +1122,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 4096
   ret i64 %result
@@ -1127,7 +1131,6 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
@@ -1363,11 +1366,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; CGP-NEXT:    v_mov_b32_e32 v6, 0xfffff000
@@ -1595,6 +1598,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -1603,7 +1607,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0xffed2705
@@ -1724,6 +1727,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 1235195
   ret i64 %result
@@ -1732,7 +1736,6 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
@@ -1968,11 +1971,11 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; CGP-NEXT:    v_mov_b32_e32 v6, 0xffed2705
@@ -2200,6 +2203,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -2208,7 +2212,6 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_sdiv_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -2218,6 +2221,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
@@ -2226,6 +2230,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
@@ -2385,6 +2390,7 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
@@ -2394,7 +2400,6 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
@@ -2664,11 +2669,11 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
@@ -2680,6 +2685,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
@@ -2857,6 +2863,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB8_8
 ; CGP-NEXT:  .LBB8_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v10
@@ -3018,6 +3025,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -3027,7 +3035,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_sdiv_i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
@@ -3050,11 +3057,11 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
@@ -3069,6 +3076,7 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
   %den.mask = and i64 %den, 16777215
@@ -3079,7 +3087,6 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_sdiv_v2i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
@@ -3330,11 +3337,11 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
@@ -3363,6 +3370,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
index ee3bf9611199472..11fd301738e5729 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
@@ -4,9 +4,9 @@
 define half @test_s16(half %a) #0 {
 ; GCN-LABEL: test_s16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt half %a, 0.0
@@ -17,9 +17,9 @@ entry:
 define float @test_s32(float %a) #0 {
 ; GCN-LABEL: test_s32:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt float %a, 0.0
@@ -30,10 +30,10 @@ entry:
 define double @test_s64(double %a) #0 {
 ; GCN-LABEL: test_s64:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt double %a, 0.0
@@ -44,7 +44,6 @@ entry:
 define <4 x half> @test_v4s16(<4 x half> %a) #0 {
 ; GCN-LABEL: test_v4s16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
@@ -60,6 +59,7 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 {
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <4 x half> %a, zeroinitializer
@@ -70,7 +70,6 @@ entry:
 define <8 x half> @test_v8s16(<8 x half> %a) #0 {
 ; GCN-LABEL: test_v8s16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
@@ -100,6 +99,7 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 {
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
 ; GCN-NEXT:    v_lshl_or_b32 v3, v3, 16, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <8 x half> %a, zeroinitializer
@@ -110,11 +110,11 @@ entry:
 define <2 x float> @test_v2s32(<2 x float> %a) #0 {
 ; GCN-LABEL: test_v2s32:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <2 x float> %a, zeroinitializer
@@ -125,7 +125,6 @@ entry:
 define <4 x float> @test_v4s32(<4 x float> %a) #0 {
 ; GCN-LABEL: test_v4s32:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v1
@@ -134,6 +133,7 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <4 x float> %a, zeroinitializer
@@ -144,13 +144,13 @@ entry:
 define <2 x double> @test_v2s64(<2 x double> %a) #0 {
 ; GCN-LABEL: test_v2s64:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[0:1]
 ; GCN-NEXT:    v_cmp_gt_f64_e64 s[4:5], 0, v[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <2 x double> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c0242..839d94da1901f58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -8,14 +8,14 @@
 define i8 @v_sext_inreg_i8_4(i8 %value) {
 ; GCN-LABEL: v_sext_inreg_i8_4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i8_4:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 4
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i8 %value, 4
   %ashr = ashr i8 %shl, 4
@@ -25,14 +25,14 @@ define i8 @v_sext_inreg_i8_4(i8 %value) {
 define i8 @v_sext_inreg_i8_7(i8 %value) {
 ; GCN-LABEL: v_sext_inreg_i8_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i8 %value, 7
   %ashr = ashr i8 %shl, 7
@@ -104,16 +104,16 @@ define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) {
 define i24 @v_sext_inreg_i24_12(i24 %value) {
 ; GCN-LABEL: v_sext_inreg_i24_12:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i24_12:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i24 %value, 12
   %ashr = ashr i24 %value, 12
@@ -123,14 +123,14 @@ define i24 @v_sext_inreg_i24_12(i24 %value) {
 define i24 @v_sext_inreg_i24_7(i24 %value) {
 ; GCN-LABEL: v_sext_inreg_i24_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 17
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i24_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 17
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i24 %value, 7
   %ashr = ashr i24 %shl, 7
@@ -170,14 +170,14 @@ define amdgpu_ps i24 @s_sext_inreg_i24_7(i24 inreg %value) {
 define i32 @v_sext_inreg_i32_3(i32 %value) {
 ; GCN-LABEL: v_sext_inreg_i32_3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 29
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i32_3:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 29
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %value, 3
   %ashr = ashr i32 %shl, 3
@@ -187,14 +187,14 @@ define i32 @v_sext_inreg_i32_3(i32 %value) {
 define i32 @v_sext_inreg_i32_31(i32 %value) {
 ; GCN-LABEL: v_sext_inreg_i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %value, 31
   %ashr = ashr i32 %value, 31
@@ -234,16 +234,16 @@ define amdgpu_ps i32 @s_sext_inreg_i32_31(i32 inreg %value) {
 define <2 x i32> @v_sext_inreg_v2i32_14(<2 x i32> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i32_14:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 18
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 18
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i32_14:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 18
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 18
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i32> %value, <i32 14, i32 14>
   %ashr = ashr <2 x i32> %shl, <i32 14, i32 14>
@@ -253,16 +253,16 @@ define <2 x i32> @v_sext_inreg_v2i32_14(<2 x i32> %value) {
 define <2 x i32> @v_sext_inreg_v2i32_31(<2 x i32> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i32> %value, <i32 31, i32 31>
   %shr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -289,18 +289,18 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v2i32_22(<2 x i32> inreg %value) {
 define <3 x i32> @v_sext_inreg_v3i32_16(<3 x i32> %value, <3 x i32> %amount) {
 ; GCN-LABEL: v_sext_inreg_v3i32_16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v3i32_16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX10PLUS-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <3 x i32> %value, <i32 16, i32 16, i32 16>
   %ashr = ashr <3 x i32> %shl, <i32 16, i32 16, i32 16>
@@ -329,20 +329,20 @@ define amdgpu_ps <3 x i32> @s_sext_inreg_v3i32_22(<3 x i32> inreg %value) {
 define <4 x i32> @v_sext_inreg_v4i32_6(<4 x i32> %value) {
 ; GCN-LABEL: v_sext_inreg_v4i32_6:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 26
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 26
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 26
 ; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 26
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v4i32_6:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 26
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 26
 ; GFX10PLUS-NEXT:    v_bfe_i32 v2, v2, 0, 26
 ; GFX10PLUS-NEXT:    v_bfe_i32 v3, v3, 0, 26
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <4 x i32> %value, <i32 6, i32 6, i32 6, i32 6>
   %ashr = ashr <4 x i32> %shl, <i32 6, i32 6, i32 6, i32 6>
@@ -373,22 +373,22 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v4i32_13(<4 x i32> inreg %value) {
 define <5 x i32> @v_sext_inreg_v5i32_30(<5 x i32> %value) {
 ; GCN-LABEL: v_sext_inreg_v5i32_30:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 2
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 2
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 2
 ; GCN-NEXT:    v_bfe_i32 v3, v3, 0, 2
 ; GCN-NEXT:    v_bfe_i32 v4, v4, 0, 2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v5i32_30:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 2
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 2
 ; GFX10PLUS-NEXT:    v_bfe_i32 v2, v2, 0, 2
 ; GFX10PLUS-NEXT:    v_bfe_i32 v3, v3, 0, 2
 ; GFX10PLUS-NEXT:    v_bfe_i32 v4, v4, 0, 2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <5 x i32> %value, <i32 30, i32 30, i32 30, i32 30, i32 30>
   %ashr = ashr <5 x i32> %shl, <i32 30, i32 30, i32 30, i32 30, i32 30>
@@ -421,7 +421,6 @@ define amdgpu_ps <5 x i32> @s_sext_inreg_v5i32_19(<5 x i32> inreg %value) {
 define <16 x i32> @v_sext_inreg_v16i32_27(<16 x i32> %value) {
 ; GCN-LABEL: v_sext_inreg_v16i32_27:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 5
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 5
 ; GCN-NEXT:    v_bfe_i32 v2, v2, 0, 5
@@ -438,11 +437,11 @@ define <16 x i32> @v_sext_inreg_v16i32_27(<16 x i32> %value) {
 ; GCN-NEXT:    v_bfe_i32 v13, v13, 0, 5
 ; GCN-NEXT:    v_bfe_i32 v14, v14, 0, 5
 ; GCN-NEXT:    v_bfe_i32 v15, v15, 0, 5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v16i32_27:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 5
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v1, 0, 5
 ; GFX10PLUS-NEXT:    v_bfe_i32 v2, v2, 0, 5
@@ -459,6 +458,7 @@ define <16 x i32> @v_sext_inreg_v16i32_27(<16 x i32> %value) {
 ; GFX10PLUS-NEXT:    v_bfe_i32 v13, v13, 0, 5
 ; GFX10PLUS-NEXT:    v_bfe_i32 v14, v14, 0, 5
 ; GFX10PLUS-NEXT:    v_bfe_i32 v15, v15, 0, 5
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <16 x i32> %value, <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
   %ashr = ashr <16 x i32> %shl, <i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
@@ -513,27 +513,27 @@ define amdgpu_ps <16 x i32> @s_sext_inreg_v16i32_3(<16 x i32> inreg %value) {
 define i16 @v_sext_inreg_i16_4(i16 %value) {
 ; GFX6-LABEL: v_sext_inreg_i16_4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 12
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_i16_4:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_i16_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 12
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i16_4:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 12
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i16 %value, 4
   %ashr = ashr i16 %shl, 4
@@ -543,27 +543,27 @@ define i16 @v_sext_inreg_i16_4(i16 %value) {
 define i16 @v_sext_inreg_i16_15(i16 %value) {
 ; GFX6-LABEL: v_sext_inreg_i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 15, v0
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i16 %value, 15
   %ashr = ashr i16 %shl, 15
@@ -635,34 +635,34 @@ define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) {
 define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v2i16_8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i16_8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i16> %value, <i16 8, i16 8>
   %ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
@@ -672,34 +672,34 @@ define <2 x i16> @v_sext_inreg_v2i16_8(<2 x i16> %value) {
 define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v2i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v2i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 15, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 15, v1
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v2i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i16> %value, <i16 15, i16 15>
   %ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
@@ -780,7 +780,6 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) {
 define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v4i16_3:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 13
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 13
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -793,11 +792,11 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v4i16_3:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 3, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -809,24 +808,25 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 3, v4
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v4i16_3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 3, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 3, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 3, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 3, v1 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v4i16_3:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, 3, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v1, 3, v1 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v0, 3, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v1, 3, v1 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <4 x i16> %value, <i16 3, i16 3, i16 3, i16 3>
   %ashr = ashr <4 x i16> %shl, <i16 3, i16 3, i16 3, i16 3>
@@ -952,7 +952,6 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) {
 define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v8i16_11:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 5
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 5
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -977,11 +976,11 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_v8i16_11:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 11
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 11, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1003,11 +1002,11 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v8
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_v8i16_11:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1]
@@ -1016,11 +1015,11 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 11, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 11, v2 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v3, 11, v3 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v8i16_11:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, 11, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v1, 11, v1 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v2, 11, v2 op_sel_hi:[0,1]
@@ -1029,6 +1028,7 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v1, 11, v1 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v2, 11, v2 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_pk_ashrrev_i16 v3, 11, v3 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <8 x i16> %value, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
   %ashr = ashr <8 x i16> %shl, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
@@ -1199,14 +1199,14 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) {
 define i64 @v_sext_inreg_i64_23(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_23:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 9
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_23:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 9
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 23
   %ashr = ashr i64 %shl, 23
@@ -1216,16 +1216,16 @@ define i64 @v_sext_inreg_i64_23(i64 %value) {
 define i64 @v_sext_inreg_i64_40(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_40:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_40:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 40
   %ashr = ashr i64 %shl, 40
@@ -1235,16 +1235,16 @@ define i64 @v_sext_inreg_i64_40(i64 %value) {
 define i64 @v_sext_inreg_i64_63(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_63:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_63:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 63
   %ashr = ashr i64 %shl, 63
@@ -1254,16 +1254,16 @@ define i64 @v_sext_inreg_i64_63(i64 %value) {
 define i64 @v_sext_inreg_i64_33(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_33:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_33:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 33
   %ashr = ashr i64 %shl, 33
@@ -1273,16 +1273,16 @@ define i64 @v_sext_inreg_i64_33(i64 %value) {
 define i64 @v_sext_inreg_i64_32(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 32
   %ashr = ashr i64 %value, 32
@@ -1292,14 +1292,14 @@ define i64 @v_sext_inreg_i64_32(i64 %value) {
 define i64 @v_sext_inreg_i64_31(i64 %value) {
 ; GCN-LABEL: v_sext_inreg_i64_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 %value, 31
   %ashr = ashr i64 %shl, 31
@@ -1384,16 +1384,16 @@ define amdgpu_ps i64 @s_sext_inreg_i64_31(i64 inreg %value) {
 define <2 x i64> @v_sext_inreg_v2i64_16(<2 x i64> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i64_16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX10PLUS-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i64> %value, <i64 16, i64 16>
   %ashr = ashr <2 x i64> %shl, <i64 16, i64 16>
@@ -1403,16 +1403,16 @@ define <2 x i64> @v_sext_inreg_v2i64_16(<2 x i64> %value) {
 define <2 x i64> @v_sext_inreg_v2i64_31(<2 x i64> %value) {
 ; GCN-LABEL: v_sext_inreg_v2i64_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v1, v0, 0, 1
 ; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_v2i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v0, 0, 1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v3, v2, 0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i64> %value, <i64 31, i64 31>
   %ashr = ashr <2 x i64> %shl, <i64 31, i64 31>
@@ -1439,7 +1439,6 @@ define amdgpu_ps <2 x i64> @s_sext_inreg_v2i64_30(<2 x i64> inreg %value) {
 define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX6-LABEL: v_sext_inreg_i65_22:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 22
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1450,11 +1449,11 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 10, v2
 ; GFX6-NEXT:    v_ashr_i64 v[2:3], v[2:3], 22
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_i65_22:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 22, v[2:3]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1465,11 +1464,11 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 10, v2
 ; GFX8-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_i65_22:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 22, v[2:3]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1479,11 +1478,11 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX9-NEXT:    v_bfe_u32 v1, v1, 0, 10
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 10, v1
 ; GFX9-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i65_22:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 22, v[2:3]
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, 10, v1
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 0, v[0:1]
@@ -1493,6 +1492,7 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX10PLUS-NEXT:    v_lshl_or_b32 v1, v2, 10, v1
 ; GFX10PLUS-NEXT:    v_ashrrev_i64 v[2:3], 22, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i65 %value, 22
   %ashr = ashr i65 %shl, 22
@@ -1502,7 +1502,6 @@ define i65 @v_sext_inreg_i65_22(i65 %value) {
 define i65 @v_sext_inreg_i65_33(i65 %value) {
 ; GFX6-LABEL: v_sext_inreg_i65_33:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1510,11 +1509,11 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sext_inreg_i65_33:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1522,11 +1521,11 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sext_inreg_i65_33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
@@ -1534,11 +1533,11 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_sext_inreg_i65_33:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10PLUS-NEXT:    v_bfe_i32 v1, v2, 0, 1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, 1, v3
@@ -1546,6 +1545,7 @@ define i65 @v_sext_inreg_i65_33(i65 %value) {
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 31, v[1:2]
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v2, 1, v2
 ; GFX10PLUS-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i65 %value, 33
   %ashr = ashr i65 %value, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 7ad19a47970039c..7c45e4ea5383115 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -37,25 +37,25 @@ define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
 define i64 @v_shl_i64_zext_i32(i32 %x) {
 ; GCN-LABEL: v_shl_i64_zext_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %x, 1073741823
   %ext = zext i32 %and to i64
@@ -86,25 +86,25 @@ define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) {
 define i64 @v_shl_i64_sext_i32(i32 %x) {
 ; GCN-LABEL: v_shl_i64_sext_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_sext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_sext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x1fffffff, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %x, 536870911
   %ext = sext i32 %and to i64
@@ -135,41 +135,41 @@ define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) {
 define i64 @v_shl_i64_zext_i32_overflow(i32 %x) {
 ; GFX7-LABEL: v_shl_i64_zext_i32_overflow:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i64_zext_i32_overflow:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i64_zext_i32_overflow:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_zext_i32_overflow:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_zext_i32_overflow:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fffffff, v0
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %x, 2147483647
   %ext = zext i32 %and to i64
@@ -200,34 +200,34 @@ define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) {
 define i64 @v_shl_i64_sext_i32_overflow(i32 %x) {
 ; GFX7-LABEL: v_shl_i64_sext_i32_overflow:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i64_sext_i32_overflow:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i64_sext_i32_overflow:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i64_sext_i32_overflow:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %x, 2147483647
   %ext = sext i32 %and to i64
@@ -451,33 +451,33 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) {
 define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
 ; GCN-LABEL: v_shl_v2i64_zext_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0x3fffffff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_v2i64_zext_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x3fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_v2i64_zext_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x3fffffff, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and <2 x i32> %x, <i32 1073741823, i32 1073741823>
   %ext = zext <2 x i32> %and to <2 x i64>
@@ -516,33 +516,33 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) {
 define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 ; GCN-LABEL: v_shl_v2i64_sext_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0x1fffffff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_v2i64_sext_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x1fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_v2i64_sext_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x1fffffff, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 2, v1
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and <2 x i32> %x, <i32 536870911, i32 536870911>
   %ext = sext <2 x i32> %and to <2 x i64>
@@ -584,32 +584,32 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
 define i32 @v_shl_i32_zext_i16(i16 %x) {
 ; GFX7-LABEL: v_shl_i32_zext_i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i32_zext_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i32_zext_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i32_zext_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 2, v0
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %and = and i16 %x, 16383
   %ext = zext i16 %and to i32
@@ -677,7 +677,6 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -687,33 +686,34 @@ define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x3fff3fff, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 2, v1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 2, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i32_zext_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v1, 2, v0 op_sel_hi:[0,1]
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %and = and <2 x i16> %x, <i16 16383, i16 16383>
   %ext = zext <2 x i16> %and to <2 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc4458792..8e1bba6e9279fff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -8,28 +8,28 @@
 define i8 @v_shl_i8(i8 %value, i8 %amount) {
 ; GFX6-LABEL: v_shl_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i8 %value, %amount
   ret i8 %result
@@ -38,26 +38,26 @@ define i8 @v_shl_i8(i8 %value, i8 %amount) {
 define i8 @v_shl_i8_7(i8 %value) {
 ; GFX6-LABEL: v_shl_i8_7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i8_7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 7, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i8_7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 7, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 7, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i8 %value, 7
   ret i8 %result
@@ -108,16 +108,16 @@ define amdgpu_ps i8 @s_shl_i8_7(i8 inreg %value) {
 define i24 @v_shl_i24(i24 %value, i24 %amount) {
 ; GCN-LABEL: v_shl_i24:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i24 %value, %amount
   ret i24 %result
@@ -126,14 +126,14 @@ define i24 @v_shl_i24(i24 %value, i24 %amount) {
 define i24 @v_shl_i24_7(i24 %value) {
 ; GCN-LABEL: v_shl_i24_7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i24_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i24 %value, 7
   ret i24 %result
@@ -170,14 +170,14 @@ define amdgpu_ps i24 @s_shl_i24_7(i24 inreg %value) {
 define i32 @v_shl_i32(i32 %value, i32 %amount) {
 ; GCN-LABEL: v_shl_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i32 %value, %amount
   ret i32 %result
@@ -186,14 +186,14 @@ define i32 @v_shl_i32(i32 %value, i32 %amount) {
 define i32 @v_shl_i32_31(i32 %value) {
 ; GCN-LABEL: v_shl_i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i32 %value, 31
   ret i32 %result
@@ -270,16 +270,16 @@ define amdgpu_ps float @shl_i32_vs(i32 %value, i32 inreg %amount) {
 define <2 x i32> @v_shl_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 ; GCN-LABEL: v_shl_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i32> %value, %amount
   ret <2 x i32> %result
@@ -288,16 +288,16 @@ define <2 x i32> @v_shl_v2i32(<2 x i32> %value, <2 x i32> %amount) {
 define <2 x i32> @v_shl_v2i32_31(<2 x i32> %value) {
 ; GCN-LABEL: v_shl_v2i32_31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i32_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i32> %value, <i32 31, i32 31>
   ret <2 x i32> %result
@@ -322,18 +322,18 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32(<2 x i32> inreg %value, <2 x i32> inreg
 define <3 x i32> @v_shl_v3i32(<3 x i32> %value, <3 x i32> %amount) {
 ; GCN-LABEL: v_shl_v3i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <3 x i32> %value, %amount
   ret <3 x i32> %result
@@ -360,20 +360,20 @@ define amdgpu_ps <3 x i32> @s_shl_v3i32(<3 x i32> inreg %value, <3 x i32> inreg
 define <4 x i32> @v_shl_v4i32(<4 x i32> %value, <4 x i32> %amount) {
 ; GCN-LABEL: v_shl_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, v5, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, v7, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <4 x i32> %value, %amount
   ret <4 x i32> %result
@@ -402,22 +402,22 @@ define amdgpu_ps <4 x i32> @s_shl_v4i32(<4 x i32> inreg %value, <4 x i32> inreg
 define <5 x i32> @v_shl_v5i32(<5 x i32> %value, <5 x i32> %amount) {
 ; GCN-LABEL: v_shl_v5i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, v6, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, v7, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, v8, v3
 ; GCN-NEXT:    v_lshlrev_b32_e32 v4, v9, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, v5, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, v6, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v2, v7, v2
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v3, v8, v3
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v4, v9, v4
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <5 x i32> %value, %amount
   ret <5 x i32> %result
@@ -448,7 +448,6 @@ define amdgpu_ps <5 x i32> @s_shl_v5i32(<5 x i32> inreg %value, <5 x i32> inreg
 define <16 x i32> @v_shl_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-LABEL: v_shl_v16i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, v16, v0
 ; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, v17, v1
@@ -467,11 +466,11 @@ define <16 x i32> @v_shl_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GCN-NEXT:    v_lshlrev_b32_e32 v14, v30, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v15, v16, v15
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v16, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, v17, v1
@@ -490,11 +489,11 @@ define <16 x i32> @v_shl_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v14, v30, v14
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v15, v31, v15
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v16, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, v17, v1
@@ -513,6 +512,7 @@ define <16 x i32> @v_shl_v16i32(<16 x i32> %value, <16 x i32> %amount) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v14, v30, v14
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v15, v31, v15
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <16 x i32> %value, %amount
   ret <16 x i32> %result
@@ -565,27 +565,27 @@ define amdgpu_ps <16 x i32> @s_shl_v16i32(<16 x i32> inreg %value, <16 x i32> in
 define i16 @v_shl_i16(i16 %value, i16 %amount) {
 ; GFX6-LABEL: v_shl_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i16 %value, %amount
   ret i16 %result
@@ -594,26 +594,26 @@ define i16 @v_shl_i16(i16 %value, i16 %amount) {
 define i16 @v_shl_i16_15(i16 %value) {
 ; GFX6-LABEL: v_shl_i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 15, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 15, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 15, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 15, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i16 %value, 15
   ret i16 %result
@@ -715,31 +715,31 @@ define amdgpu_ps half @shl_i16_vs(i16 %value, i16 inreg %amount) {
 define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i16> %value, %amount
   ret <2 x i16> %result
@@ -748,30 +748,30 @@ define <2 x i16> @v_shl_v2i16(<2 x i16> %value, <2 x i16> %amount) {
 define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 ; GFX6-LABEL: v_shl_v2i16_15:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 15, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 15, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i16_15:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 15, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i16> %value, <i16 15, i16 15>
   ret <2 x i16> %result
@@ -908,7 +908,6 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
@@ -925,31 +924,32 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v2, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <4 x i16> %value, %amount
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -1051,7 +1051,6 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-LABEL: v_shl_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v8, v0
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v9
@@ -1084,11 +1083,11 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v8, v4, v0
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v4, v5, v1
@@ -1101,24 +1100,25 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v5, v1
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v2, v6, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v3, v7, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v0, v4, v0
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v1, v5, v1
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v2, v6, v2
 ; GFX10PLUS-NEXT:    v_pk_lshlrev_b16 v3, v7, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <8 x i16> %value, %amount
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -1245,26 +1245,26 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 define i64 @v_shl_i64(i64 %value, i64 %amount) {
 ; GFX6-LABEL: v_shl_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i64 %value, %amount
   ret i64 %result
@@ -1273,22 +1273,22 @@ define i64 @v_shl_i64(i64 %value, i64 %amount) {
 define i64 @v_shl_i64_63(i64 %value) {
 ; GCN-LABEL: v_shl_i64_63:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_63:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_63:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 31, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i64 %value, 63
   ret i64 %result
@@ -1297,22 +1297,22 @@ define i64 @v_shl_i64_63(i64 %value) {
 define i64 @v_shl_i64_33(i64 %value) {
 ; GCN-LABEL: v_shl_i64_33:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i64 %value, 33
   ret i64 %result
@@ -1321,22 +1321,22 @@ define i64 @v_shl_i64_33(i64 %value) {
 define i64 @v_shl_i64_32(i64 %value) {
 ; GCN-LABEL: v_shl_i64_32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i64_32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i64 %value, 32
   ret i64 %result
@@ -1345,26 +1345,26 @@ define i64 @v_shl_i64_32(i64 %value) {
 define i64 @v_shl_i64_31(i64 %value) {
 ; GFX6-LABEL: v_shl_i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i64 %value, 31
   ret i64 %result
@@ -1499,30 +1499,30 @@ define amdgpu_ps <2 x float> @shl_i64_vs(i64 %value, i64 inreg %amount) {
 define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 ; GFX6-LABEL: v_shl_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i64> %value, %amount
   ret <2 x i64> %result
@@ -1531,30 +1531,30 @@ define <2 x i64> @v_shl_v2i64(<2 x i64> %value, <2 x i64> %amount) {
 define <2 x i64> @v_shl_v2i64_31(<2 x i64> %value) {
 ; GFX6-LABEL: v_shl_v2i64_31:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
 ; GFX6-NEXT:    v_lshl_b64 v[2:3], v[2:3], 31
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_v2i64_31:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 31, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_v2i64_31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 31, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_v2i64_31:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 31, v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = shl <2 x i64> %value, <i64 31, i64 31>
   ret <2 x i64> %result
@@ -1579,7 +1579,6 @@ define amdgpu_ps <2 x i64> @s_shl_v2i64(<2 x i64> inreg %value, <2 x i64> inreg
 define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX6-LABEL: v_shl_i65:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 64, v3
 ; GFX6-NEXT:    v_lshr_b64 v[4:5], v[0:1], v4
 ; GFX6-NEXT:    v_lshl_b64 v[5:6], v[2:3], v3
@@ -1593,11 +1592,11 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i65:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 64, v3
 ; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v4, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
@@ -1611,11 +1610,11 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i65:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v4, 64, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v4, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[5:6], v3, v[2:3]
@@ -1629,11 +1628,11 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i65:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
@@ -1647,11 +1646,11 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i65:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 64, v3
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v3, v[2:3]
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v8, 64, v3
@@ -1664,6 +1663,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 v1, 0, v7
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i65 %value, %amount
   ret i65 %result
@@ -1672,46 +1672,46 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) {
 define i65 @v_shl_i65_33(i65 %value) {
 ; GFX6-LABEL: v_shl_i65_33:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; GFX6-NEXT:    v_lshr_b64 v[2:3], v[0:1], 31
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl_i65_33:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 31, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i65_33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 31, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl_i65_33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], 31, v[0:1]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i65_33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 31, v[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = shl i65 %value, 33
   ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
index 7b7afd385af9b3d..a4667fa54e58df1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
@@ -104,21 +104,21 @@ define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) {
 define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
 ; GFX9-LABEL: v_shl1_add_u32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl1_add_u32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl1_add_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %src0, 1
   %add = add i32 %shl, %src1
@@ -128,21 +128,21 @@ define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
 define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
 ; GFX9-LABEL: v_shl2_add_u32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl2_add_u32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl2_add_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %src0, 2
   %add = add i32 %shl, %src1
@@ -152,21 +152,21 @@ define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
 define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
 ; GFX9-LABEL: v_shl3_add_u32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl3_add_u32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl3_add_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %src0, 3
   %add = add i32 %shl, %src1
@@ -176,21 +176,21 @@ define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
 define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
 ; GFX9-LABEL: v_shl4_add_u32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 4, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl4_add_u32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl4_add_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 4, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %src0, 4
   %add = add i32 %shl, %src1
@@ -200,21 +200,21 @@ define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
 define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) {
 ; GFX9-LABEL: v_shl5_add_u32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 5, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_shl5_add_u32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_shl5_add_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 5, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i32 %src0, 5
   %add = add i32 %shl, %src1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
index dc13dee4f148ac3..cbf803bded3233c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
@@ -6,14 +6,14 @@
 define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
 ; GFX89-LABEL: test_min_max_ValK0_K1_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_max_ValK0_K1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
   %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
@@ -23,14 +23,14 @@ define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
 define i32 @min_max_ValK0_K1_i32(i32 %a) {
 ; GFX89-LABEL: min_max_ValK0_K1_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_max_ValK0_K1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a)
   %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
@@ -40,14 +40,14 @@ define i32 @min_max_ValK0_K1_i32(i32 %a) {
 define i32 @test_min_K1max_ValK0__i32(i32 %a) {
 ; GFX89-LABEL: test_min_K1max_ValK0__i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_K1max_ValK0__i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
   %smed = call i32 @llvm.smin.i32(i32 17, i32 %smax)
@@ -57,14 +57,14 @@ define i32 @test_min_K1max_ValK0__i32(i32 %a) {
 define i32 @test_min_K1max_K0Val__i32(i32 %a) {
 ; GFX89-LABEL: test_min_K1max_K0Val__i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_K1max_K0Val__i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a)
   %smed = call i32 @llvm.smin.i32(i32 17, i32 %smax)
@@ -74,14 +74,14 @@ define i32 @test_min_K1max_K0Val__i32(i32 %a) {
 define i32 @test_max_min_ValK1_K0_i32(i32 %a) {
 ; GFX89-LABEL: test_max_min_ValK1_K0_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_min_ValK1_K0_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 17)
   %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12)
@@ -91,14 +91,14 @@ define i32 @test_max_min_ValK1_K0_i32(i32 %a) {
 define i32 @test_max_min_K1Val_K0_i32(i32 %a) {
 ; GFX89-LABEL: test_max_min_K1Val_K0_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_min_K1Val_K0_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 17, i32 %a)
   %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12)
@@ -108,14 +108,14 @@ define i32 @test_max_min_K1Val_K0_i32(i32 %a) {
 define i32 @test_max_K0min_ValK1__i32(i32 %a) {
 ; GFX89-LABEL: test_max_K0min_ValK1__i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_ValK1__i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 17)
   %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin)
@@ -125,14 +125,14 @@ define i32 @test_max_K0min_ValK1__i32(i32 %a) {
 define i32 @test_max_K0min_K1Val__i32(i32 %a) {
 ; GFX89-LABEL: test_max_K0min_K1Val__i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_K1Val__i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 17, i32 %a)
   %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin)
@@ -142,7 +142,6 @@ define i32 @test_max_K0min_K1Val__i32(i32 %a) {
 define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) {
 ; GFX8-LABEL: test_max_K0min_K1Val__v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 17
 ; GFX8-NEXT:    v_min_i16_e32 v1, 17, v0
 ; GFX8-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -150,20 +149,21 @@ define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, -12, v1
 ; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_max_K0min_K1Val__v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_i16 v0, 17, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_max_i16 v0, -12, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_K1Val__v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_i16 v0, 17, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_max_i16 v0, -12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
   %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> <i16 -12, i16 -12>, <2 x i16> %smin)
@@ -190,15 +190,15 @@ define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
 define i32 @test_non_inline_constant_i32(i32 %a) {
 ; GFX89-LABEL: test_non_inline_constant_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mov_b32_e32 v1, 0x41
 ; GFX89-NEXT:    v_med3_i32 v0, v0, -12, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_non_inline_constant_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_i32 v0, v0, -12, 0x41
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
   %smed = call i32 @llvm.smin.i32(i32 %smax, i32 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 1bb606f36e48d2c..178d843de8a159a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -7,7 +7,6 @@
 define i32 @v_srem_i32(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_srem_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -33,11 +32,11 @@ define i32 @v_srem_i32(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -63,6 +62,7 @@ define i32 @v_srem_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i32 %num, %den
   ret i32 %result
@@ -139,7 +139,6 @@ define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
 define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_srem_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -190,11 +189,11 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
@@ -245,6 +244,7 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -253,7 +253,6 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_srem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x45800000
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xfffff000
@@ -276,6 +275,7 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i32 %num, 4096
   ret i32 %result
@@ -284,7 +284,6 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_srem_v2i32_pow2k_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
@@ -322,11 +321,11 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_pow2k_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x45800000
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
@@ -363,6 +362,7 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
@@ -371,7 +371,6 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_srem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xffed2705
@@ -394,6 +393,7 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i32 %num, 1235195
   ret i32 %result
@@ -402,7 +402,6 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_srem_v2i32_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
@@ -440,11 +439,11 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x4996c7d8
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
@@ -481,6 +480,7 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
@@ -489,7 +489,6 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-LABEL: v_srem_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
@@ -516,6 +515,7 @@ define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
@@ -525,7 +525,6 @@ define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_srem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
@@ -578,11 +577,11 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
@@ -635,6 +634,7 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
@@ -644,7 +644,6 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_srem_i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
@@ -672,11 +671,11 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -696,6 +695,7 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i32 %num, 16777215
   %den.mask = and i32 %den, 16777215
@@ -706,7 +706,6 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_srem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -761,11 +760,11 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -804,6 +803,7 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
   %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 0b22b3b3a4ba7c6..36babf86ace318d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -7,13 +7,13 @@
 define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_srem_i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
@@ -22,6 +22,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
@@ -177,6 +178,7 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, %den
   ret i64 %result
@@ -367,7 +369,6 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_srem_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v5, v8, vcc
@@ -629,11 +630,11 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v11, v1
 ; CGP-NEXT:    v_mov_b32_e32 v10, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v11, v5
@@ -642,6 +643,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_mov_b32_e32 v8, v2
 ; CGP-NEXT:    v_mov_b32_e32 v9, v3
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_2
@@ -813,6 +815,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB2_8
 ; CGP-NEXT:  .LBB2_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
@@ -968,6 +971,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -976,7 +980,6 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0xfffff000
@@ -1095,6 +1098,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 4096
   ret i64 %result
@@ -1103,7 +1107,6 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
@@ -1335,11 +1338,11 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; CGP-NEXT:    v_mov_b32_e32 v6, 0xfffff000
@@ -1563,6 +1566,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -1571,7 +1575,6 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, 0xffed2705
@@ -1690,6 +1693,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 1235195
   ret i64 %result
@@ -1698,7 +1702,6 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
@@ -1930,11 +1933,11 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; CGP-NEXT:    v_mov_b32_e32 v6, 0xffed2705
@@ -2158,6 +2161,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -2166,7 +2170,6 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_srem_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -2176,6 +2179,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_or_b32_e32 v1, v4, v6
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
@@ -2184,6 +2188,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v6
@@ -2341,6 +2346,7 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
@@ -2350,7 +2356,6 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v8, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v9, 0
 ; GISEL-NEXT:    v_lshl_b64 v[4:5], v[8:9], v4
@@ -2616,11 +2621,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
 ; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
@@ -2632,6 +2637,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
@@ -2808,6 +2814,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB8_8
 ; CGP-NEXT:  .LBB8_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v10
@@ -2965,6 +2972,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y
@@ -2974,7 +2982,6 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_srem_i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
@@ -2995,11 +3002,11 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v2, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
@@ -3016,6 +3023,7 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
   %den.mask = and i64 %den, 16777215
@@ -3026,7 +3034,6 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_srem_v2i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
@@ -3277,11 +3284,11 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
 ; CGP-NEXT:    v_cvt_f32_i32_e32 v3, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
@@ -3314,6 +3321,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 61e1e67b7ae360e..58a5bed714903d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -8,7 +8,6 @@
 define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
@@ -19,11 +18,11 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
@@ -34,24 +33,25 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 9, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
@@ -118,7 +118,6 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
@@ -129,11 +128,11 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
@@ -144,24 +143,25 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -228,7 +228,6 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-LABEL: v_ssubsat_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
@@ -255,11 +254,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
@@ -283,11 +282,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -301,11 +300,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -319,11 +318,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -339,6 +338,7 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -499,7 +499,6 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-LABEL: v_ssubsat_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
@@ -558,11 +557,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
@@ -613,11 +612,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
@@ -645,11 +644,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -676,11 +675,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -707,6 +706,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -1000,7 +1000,6 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
@@ -1011,11 +1010,11 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -1026,24 +1025,25 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
@@ -1107,7 +1107,6 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
@@ -1115,11 +1114,11 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
@@ -1127,18 +1126,19 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -1260,7 +1260,6 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
@@ -1275,11 +1274,11 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
@@ -1294,20 +1293,21 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1374,7 +1374,6 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v0
@@ -1397,11 +1396,11 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6
 ; GFX8-NEXT:    v_min_i32_e32 v8, -1, v0
@@ -1424,22 +1423,23 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -1525,7 +1525,6 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
@@ -1556,11 +1555,11 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v7
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX8-NEXT:    v_min_i32_e32 v10, -1, v0
@@ -1591,24 +1590,25 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v7
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_sub_i32 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -1713,7 +1713,6 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
@@ -1751,11 +1750,11 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
@@ -1793,26 +1792,27 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v5i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v5 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v6 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v7 clamp
 ; GFX9-NEXT:    v_sub_i32 v3, v3, v8 clamp
 ; GFX9-NEXT:    v_sub_i32 v4, v4, v9 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v7 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, v3, v8 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v4, v4, v9 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
   ret <5 x i32> %result
@@ -1936,7 +1936,6 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_i32_e32 v32, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v31, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v31
@@ -2053,11 +2052,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i32_e32 v32, -1, v0
 ; GFX8-NEXT:    v_bfrev_b32_e32 v31, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v31
@@ -2174,11 +2173,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, v15, v16
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v17 clamp
@@ -2197,11 +2196,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_i32 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
@@ -2220,11 +2219,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
@@ -2243,6 +2242,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -2575,7 +2575,6 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2586,11 +2585,11 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
 ; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
@@ -2598,18 +2597,19 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -2748,7 +2748,6 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -2769,11 +2768,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v3, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x7fff, v3
 ; GFX8-NEXT:    v_min_i16_e32 v4, -1, v0
@@ -2790,18 +2789,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v3
 ; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -3034,7 +3034,6 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -3085,11 +3084,11 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x7fff, v6
 ; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
@@ -3122,20 +3121,21 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v7
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -3296,7 +3296,6 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v6i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
@@ -3371,11 +3370,11 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v9, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v9, 0x7fff, v9
 ; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
@@ -3424,22 +3423,23 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v11
 ; GFX8-NEXT:    v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v6i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
   %cast = bitcast <6 x i16> %result to <3 x float>
@@ -3648,7 +3648,6 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
 ; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
@@ -3747,11 +3746,11 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v12, -1, v0
 ; GFX8-NEXT:    v_subrev_u16_e32 v12, 0x7fff, v12
 ; GFX8-NEXT:    v_min_i16_e32 v13, -1, v0
@@ -3816,24 +3815,25 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v15
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -4101,7 +4101,6 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_bfe_i32 v5, v4, 0, 16
@@ -4115,11 +4114,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 16
@@ -4133,11 +4132,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
@@ -4150,11 +4149,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i48:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
@@ -4167,11 +4166,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
 ; GFX10-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_i48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
@@ -4183,6 +4182,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 16, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
@@ -4498,7 +4498,6 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4508,11 +4507,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4522,11 +4521,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
@@ -4536,11 +4535,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
@@ -4550,11 +4549,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
@@ -4563,6 +4562,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_add_co_u32 v1, null, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -4815,7 +4815,6 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
 ; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4835,11 +4834,11 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v0, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4859,11 +4858,11 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
@@ -4883,11 +4882,11 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
@@ -4906,11 +4905,11 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
 ; GFX11-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
@@ -4927,6 +4926,7 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s2, s1
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -5674,7 +5674,6 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
 ; GFX6-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
@@ -5726,11 +5725,11 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v0, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
 ; GFX8-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
@@ -5782,11 +5781,11 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v16, vcc, v0, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
@@ -5838,11 +5837,11 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
@@ -5893,11 +5892,11 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v21, v7, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
@@ -5947,6 +5946,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v21, v7, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
index 15740ee5476e846..ff6869805f5e486 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
@@ -5,8 +5,8 @@
 define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -15,13 +15,12 @@ define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0
 define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
@@ -29,6 +28,7 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -37,14 +37,13 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
 define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
@@ -53,6 +52,7 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -61,14 +61,13 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
@@ -83,6 +82,7 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
@@ -91,8 +91,8 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg half %z
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -102,8 +102,8 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z
 define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg half %x
   %neg.y = fneg half %y
@@ -114,8 +114,8 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha
 define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call half @llvm.fabs.f16(half %x) #0
   %neg.y = call half @llvm.fabs.f16(half %y) #0
@@ -126,13 +126,12 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha
 define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -142,6 +141,7 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %
 ; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x half> %x
   %neg.y = fneg <2 x half> %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
index 5955c590e9d1cb7..0f14b9e749654cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
@@ -4,8 +4,8 @@
 define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -14,9 +14,9 @@ define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z)
 define <2 x float> @v_constained_fma_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -25,10 +25,10 @@ define <2 x float> @v_constained_fma_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <3 x float> @v_constained_fma_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y, <3 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float> %x, <3 x float> %y, <3 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -37,11 +37,11 @@ define <3 x float> @v_constained_fma_v3f32_fpexcept_strict(<3 x float> %x, <3 x
 define <4 x float> @v_constained_fma_v4f32_fpexcept_strict(<4 x float> %x, <4 x float> %y, <4 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v4f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x float> %val
@@ -50,8 +50,8 @@ define <4 x float> @v_constained_fma_v4f32_fpexcept_strict(<4 x float> %x, <4 x
 define float @v_constained_fma_f32_fpexcept_strict_fneg(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg float %z
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -61,8 +61,8 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg(float %x, float %y, floa
 define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %neg.y = fneg float %y
@@ -73,8 +73,8 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y,
 define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call float @llvm.fabs.f32(float %x) #0
   %neg.y = call float @llvm.fabs.f32(float %y) #0
@@ -85,9 +85,9 @@ define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y,
 define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, -v0, -v2, v4
 ; GCN-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x float> %x
   %neg.y = fneg <2 x float> %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
index 04a07c42c934c6b..b997f8d05ae4764 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
@@ -4,8 +4,8 @@
 define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %val
@@ -14,9 +14,9 @@ define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double
 define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -25,10 +25,10 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2
 define <3 x double> @v_constained_fma_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y, <3 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v3f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double> %x, <3 x double> %y, <3 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x double> %val
@@ -37,11 +37,11 @@ define <3 x double> @v_constained_fma_v3f64_fpexcept_strict(<3 x double> %x, <3
 define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4 x double> %y, <4 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v4f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GCN-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x double> %val
@@ -50,8 +50,8 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4
 define double @v_constained_fma_f64_fpexcept_strict_fneg(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg double %z
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -61,8 +61,8 @@ define double @v_constained_fma_f64_fpexcept_strict_fneg(double %x, double %y, d
 define double @v_constained_fma_f64_fpexcept_strict_fneg_fneg(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %neg.y = fneg double %y
@@ -73,8 +73,8 @@ define double @v_constained_fma_f64_fpexcept_strict_fneg_fneg(double %x, double
 define double @v_constained_fma_f64_fpexcept_strict_fabs_fabs(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call double @llvm.fabs.f64(double %x) #0
   %neg.y = call double @llvm.fabs.f64(double %y) #0
@@ -85,9 +85,9 @@ define double @v_constained_fma_f64_fpexcept_strict_fabs_fabs(double %x, double
 define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index aa7aa6b21a562f7..858209d9a256288 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -7,28 +7,28 @@
 define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; GFX9-LABEL: v_sub_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %a, %b
   ret <2 x i16> %sub
@@ -37,29 +37,29 @@ define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
 define <2 x i16> @v_sub_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-LABEL: v_sub_v2i16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
@@ -70,29 +70,29 @@ define <2 x i16> @v_sub_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
 define <2 x i16> @v_sub_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_sub_v2i16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg <2 x half> %b
   %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
@@ -103,30 +103,30 @@ define <2 x i16> @v_sub_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
 define <2 x i16> @v_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 ; GFX9-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_fneg_lhs_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg <2 x half> %a
   %neg.b = fneg <2 x half> %b
@@ -139,30 +139,30 @@ define <2 x i16> @v_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
 define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 0xffc0, v0
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc0 op_sel_hi:[1,0]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %a, <i16 -64, i16 -64>
   ret <2 x i16> %sub
@@ -171,30 +171,30 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_lo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX8-NEXT:    v_subrev_u16_e32 v1, 0xffc0, v0
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_lo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_lo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x4ffc0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %a, <i16 -64, i16 4>
   ret <2 x i16> %sub
@@ -203,30 +203,30 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
 define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 ; GFX9-LABEL: v_sub_v2i16_neg_inline_imm_hi:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
 ; GFX8-NEXT:    v_subrev_u16_e32 v2, 4, v0
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sub_v2i16_neg_inline_imm_hi:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xffc00004
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %a, <i16 4, i16 -64>
   ret <2 x i16> %sub
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
index f8e7e5ecd6260fb..4b8bd8c25be5f7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll
@@ -6,26 +6,26 @@
 define i32 @v_usubo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_usubo_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubo_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %sub = extractvalue {i32, i1} %usubo, 0
@@ -38,32 +38,32 @@ define i32 @v_usubo_i32(i32 %a, i32 %b) {
 define i64 @v_usubo_i64(i64 %a, i64 %b) {
 ; GFX7-LABEL: v_usubo_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubo_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %sub = extractvalue {i64, i1} %usubo, 0
@@ -76,7 +76,6 @@ define i64 @v_usubo_i64(i64 %a, i64 %b) {
 define i8 @v_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_usubo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -84,11 +83,11 @@ define i8 @v_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -96,15 +95,16 @@ define i8 @v_usubo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
   %sub = extractvalue {i8, i1} %usubo, 0
@@ -117,7 +117,6 @@ define i8 @v_usubo_i8(i8 %a, i8 %b) {
 define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_usubo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -125,11 +124,11 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -137,11 +136,11 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
@@ -149,6 +148,7 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i7, i1} @llvm.usub.with.overflow.i7(i7 %a, i7 %b)
   %sub = extractvalue {i7, i1} %usubo, 0
@@ -161,35 +161,35 @@ define i7 @v_usubo_i7(i7 %a, i7 %b) {
 define <2 x i32> @v_usubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-LABEL: v_usubo_v2i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubo_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {<2 x i32>, <2 x i1>} @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
   %sub = extractvalue {<2 x i32>, <2 x i1>} %usubo, 0
@@ -202,35 +202,35 @@ define <2 x i32> @v_usubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 define i32 @v_ssubo_i32(i32 %a, i32 %b) {
 ; GFX7-LABEL: v_ssubo_i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX7-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubo_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubo_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v2, v0
 ; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
   %sub = extractvalue {i32, i1} %ssubo, 0
@@ -243,7 +243,6 @@ define i32 @v_ssubo_i32(i32 %a, i32 %b) {
 define i64 @v_ssubo_i64(i64 %a, i64 %b) {
 ; GFX7-LABEL: v_ssubo_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; GFX7-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX7-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -252,11 +251,11 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubo_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -265,11 +264,11 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v0
 ; GFX8-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v5, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubo_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -278,6 +277,7 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v4, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
   %sub = extractvalue {i64, i1} %ssubo, 0
@@ -290,7 +290,6 @@ define i64 @v_ssubo_i64(i64 %a, i64 %b) {
 define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-LABEL: v_ssubo_v2i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v5, vcc, v1, v3
 ; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -303,11 +302,11 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubo_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -320,11 +319,11 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v4, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v5, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubo_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, v4, v0
@@ -337,6 +336,7 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {<2 x i32>, <2 x i1>} @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b)
   %sub = extractvalue {<2 x i32>, <2 x i1>} %ssubo, 0
@@ -349,7 +349,6 @@ define <2 x i32> @v_ssubo_v2i32(<2 x i32> %a, <2 x i32> %b) {
 define i8 @v_ssubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: v_ssubo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -359,11 +358,11 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -373,11 +372,11 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -385,6 +384,7 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
   %sub = extractvalue {i8, i1} %ssubo, 0
@@ -397,7 +397,6 @@ define i8 @v_ssubo_i8(i8 %a, i8 %b) {
 define i7 @v_ssubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: v_ssubo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -407,11 +406,11 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -421,11 +420,11 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -435,6 +434,7 @@ define i7 @v_ssubo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i7, i1} @llvm.ssub.with.overflow.i7(i7 %a, i7 %b)
   %sub = extractvalue {i7, i1} %ssubo, 0
@@ -549,7 +549,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 define i8 @s_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_usubo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -557,11 +556,11 @@ define i8 @s_usubo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_usubo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -569,15 +568,16 @@ define i8 @s_usubo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_usubo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cmp_ne_u32_sdwa s[4:5], v0, v0 src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
   %sub = extractvalue {i8, i1} %usubo, 0
@@ -590,7 +590,6 @@ define i8 @s_usubo_i8(i8 %a, i8 %b) {
 define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_usubo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -598,11 +597,11 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_usubo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
@@ -610,11 +609,11 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_usubo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
@@ -622,6 +621,7 @@ define i7 @s_usubo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %usubo = call {i7, i1} @llvm.usub.with.overflow.i7(i7 %a, i7 %b)
   %sub = extractvalue {i7, i1} %usubo, 0
@@ -805,7 +805,6 @@ define amdgpu_ps <2 x i32> @s_ssubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
 define i8 @s_ssubo_i8(i8 %a, i8 %b) {
 ; GFX7-LABEL: s_ssubo_i8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -815,11 +814,11 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_ssubo_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 8
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
@@ -829,11 +828,11 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_ssubo_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_lt_i32_sdwa s[4:5], sext(v2), sext(v0) src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
@@ -841,6 +840,7 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[6:7], s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
   %sub = extractvalue {i8, i1} %ssubo, 0
@@ -853,7 +853,6 @@ define i8 @s_ssubo_i8(i8 %a, i8 %b) {
 define i7 @s_ssubo_i7(i7 %a, i7 %b) {
 ; GFX7-LABEL: s_ssubo_i7:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -863,11 +862,11 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) {
 ; GFX7-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: s_ssubo_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -877,11 +876,11 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) {
 ; GFX8-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_ssubo_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e32 v2, v0, v1
 ; GFX9-NEXT:    v_bfe_i32 v3, v2, 0, 7
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 7
@@ -891,6 +890,7 @@ define i7 @s_ssubo_i7(i7 %a, i7 %b) {
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %ssubo = call {i7, i1} @llvm.ssub.with.overflow.i7(i7 %a, i7 %b)
   %sub = extractvalue {i7, i1} %ssubo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
index 0cb346c35552c50..9bd7642fea77dfc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll
@@ -83,16 +83,16 @@ define i16 @v_trunc_i128_to_i16(i128 %src) {
 define i32 @v_trunc_v2i32_to_v2i16(<2 x i32> %src) {
 ; GFX7-LABEL: v_trunc_v2i32_to_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_trunc_v2i32_to_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc <2 x i32> %src to <2 x i16>
   %cast = bitcast <2 x i16> %trunc to i32
@@ -137,21 +137,21 @@ define amdgpu_ps i32 @s_trunc_v2i32_to_v2i16(<2 x i32> inreg %src) {
 define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
 ; GFX7-LABEL: v_trunc_v4i32_to_v4i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_trunc_v4i32_to_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX8-NEXT:    v_mov_b32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc <4 x i32> %src to <4 x i16>
   %cast = bitcast <4 x i16> %trunc to <2 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 1821d29d4b050b6..cef3352c516b662 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -8,40 +8,40 @@
 define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
 ; GFX6-NEXT:    v_not_b32_e32 v2, v0
 ; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
 ; GFX10PLUS-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 9, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
@@ -93,40 +93,40 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_not_b32_e32 v2, v0
 ; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -178,7 +178,6 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-LABEL: v_uaddsat_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
@@ -195,11 +194,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -210,11 +209,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -228,11 +227,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -246,11 +245,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uaddsat_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -266,6 +265,7 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -391,7 +391,6 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-LABEL: v_uaddsat_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
@@ -426,11 +425,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
@@ -457,11 +456,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
@@ -489,11 +488,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -520,11 +519,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uaddsat_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -551,6 +550,7 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -775,40 +775,40 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_not_b32_e32 v2, v0
 ; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
@@ -860,28 +860,28 @@ define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v2, v0
 ; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -975,34 +975,34 @@ define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v4, v0
 ; GFX6-NEXT:    v_min_u32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v1
 ; GFX6-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v3 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1053,7 +1053,6 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v6, v0
 ; GFX6-NEXT:    v_min_u32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
@@ -1063,30 +1062,31 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_not_b32_e32 v3, v2
 ; GFX6-NEXT:    v_min_u32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v3 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v4 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v5 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -1148,7 +1148,6 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v8, v0
 ; GFX6-NEXT:    v_min_u32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
@@ -1161,33 +1160,34 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_not_b32_e32 v4, v3
 ; GFX6-NEXT:    v_min_u32_e32 v4, v4, v7
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v4 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v5 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v6 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v7 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -1260,7 +1260,6 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v10, v0
 ; GFX6-NEXT:    v_min_u32_e32 v5, v10, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
@@ -1276,36 +1275,37 @@ define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_not_b32_e32 v5, v4
 ; GFX6-NEXT:    v_min_u32_e32 v5, v5, v9
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v5 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v6 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v7 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v8 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v9 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v5i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v5 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v6 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v7 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v3, v3, v8 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v4, v4, v9 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v0, v0, v5 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v1, v1, v6 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v2, v2, v7 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v3, v3, v8 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_u32_e64 v4, v4, v9 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
   ret <5 x i32> %result
@@ -1389,7 +1389,6 @@ define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v31, v0
 ; GFX6-NEXT:    v_min_u32_e32 v16, v31, v16
 ; GFX6-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -1440,11 +1439,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v16, v16, v31
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v16 clamp
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v17 clamp
@@ -1463,11 +1462,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30 clamp
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v15, s[4:5], v15, v16 clamp
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v17 clamp
@@ -1486,11 +1485,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_add_u32_e64 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v17 clamp
@@ -1509,11 +1508,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_add_nc_u32_e64 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uaddsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_add_nc_u32_e64 v1, v1, v17 clamp
@@ -1532,6 +1531,7 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_add_nc_u32_e64 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e64 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -1736,31 +1736,31 @@ define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_not_b32_e32 v2, v0
 ; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -1863,7 +1863,6 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_not_b32_e32 v4, v0
@@ -1876,26 +1875,27 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v2, v0, v1 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -2044,7 +2044,6 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_not_b32_e32 v8, v0
@@ -2069,31 +2068,32 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v4, v0, v2 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v2, v1, v3 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -2191,7 +2191,6 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v6i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_not_b32_e32 v12, v0
@@ -2228,11 +2227,11 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v6, v0, v3 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v3, v1, v4 clamp
@@ -2242,22 +2241,23 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v6i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
   %cast = bitcast <6 x i16> %result to <3 x float>
@@ -2372,7 +2372,6 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX6-NEXT:    v_not_b32_e32 v16, v0
@@ -2421,11 +2420,11 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
 ; GFX6-NEXT:    v_alignbit_b32 v3, v7, v6, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v8, v0, v4 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v4, v1, v5 clamp
@@ -2438,24 +2437,25 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_u16 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -2598,7 +2598,6 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
@@ -2611,11 +2610,11 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
@@ -2623,11 +2622,11 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
@@ -2635,11 +2634,11 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i48:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
@@ -2647,6 +2646,7 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
@@ -2826,38 +2826,38 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_uaddsat_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10PLUS-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -2978,7 +2978,6 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
@@ -2987,11 +2986,11 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
@@ -3000,11 +2999,11 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
@@ -3013,11 +3012,11 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, s4, v2, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
@@ -3026,11 +3025,11 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uaddsat_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_add_co_u32 v2, s0, v2, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
@@ -3039,6 +3038,7 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -3259,7 +3259,6 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
@@ -3276,11 +3275,11 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
@@ -3297,11 +3296,11 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
@@ -3318,11 +3317,11 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
 ; GFX10-NEXT:    v_add_co_u32 v4, s4, v4, v12
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
@@ -3339,11 +3338,11 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, -1, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, -1, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uaddsat_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
 ; GFX11-NEXT:    v_add_co_u32 v4, s0, v4, v12
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
@@ -3360,6 +3359,7 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, -1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, -1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll
index 9fb359b768a5e50..2ca475e7dba126c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ubfx.ll
@@ -7,8 +7,8 @@
 define i32 @v_srl_mask_i32(i32 %value) {
 ; GCN-LABEL: v_srl_mask_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = lshr i32 %value, 8
  %2 = and i32 %1, 31
@@ -41,8 +41,8 @@ define amdgpu_ps i32 @s_srl_big_mask_i32(i32 inreg %value) {
 define i32 @v_mask_srl_i32(i32 %value) {
 ; GCN-LABEL: v_mask_srl_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = and i32 %value, 7936 ; 31 << 8
  %2 = lshr i32 %1, 8
@@ -64,10 +64,10 @@ define amdgpu_ps i32 @s_mask_srl_i32(i32 inreg %value) {
 define i64 @v_srl_mask_i64(i64 %value) {
 ; GCN-LABEL: v_srl_mask_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b64 v[0:1], 25, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 10
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = lshr i64 %value, 25
  %2 = and i64 %1, 1023
@@ -103,10 +103,10 @@ define amdgpu_ps i64 @s_srl_big_mask_i64(i64 inreg %value) {
 define i64 @v_mask_srl_i64(i64 %value) {
 ; GCN-LABEL: v_mask_srl_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b64 v[0:1], 25, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 10
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %1 = and i64 %value, 34326183936 ; 1023 << 25
  %2 = lshr i64 %1, 25
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index 6588112973f4c94..a611e12235133ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -7,7 +7,6 @@
 define i32 @v_udiv_i32(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_udiv_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -27,11 +26,11 @@ define i32 @v_udiv_i32(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -51,6 +50,7 @@ define i32 @v_udiv_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i32 %num, %den
   ret i32 %result
@@ -115,7 +115,6 @@ define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
 define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_udiv_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -154,11 +153,11 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -197,6 +196,7 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -205,8 +205,8 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_udiv_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i32 %num, 4096
   ret i32 %result
@@ -215,9 +215,9 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
 define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 12, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
@@ -226,13 +226,13 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
 define i32 @v_udiv_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_udiv_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i32 %num, 1235195
   ret i32 %result
@@ -241,7 +241,6 @@ define i32 @v_udiv_i32_oddk_denom(i32 %num) {
 define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
@@ -253,6 +252,7 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
@@ -261,7 +261,6 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
 define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
@@ -282,6 +281,7 @@ define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
@@ -291,7 +291,6 @@ define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
 define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
@@ -332,11 +331,11 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
@@ -377,6 +376,7 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -386,7 +386,6 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_udiv_i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -408,11 +407,11 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -434,6 +433,7 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i32 %num, 16777215
   %den.mask = and i32 %den, 16777215
@@ -444,7 +444,6 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_udiv_v2i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -487,11 +486,11 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -534,6 +533,7 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
   %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 3add708d1a6394d..0e90aafa582eb11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -7,7 +7,6 @@
 define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_udiv_i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v1
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
@@ -15,6 +14,7 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
@@ -23,6 +23,7 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v3
@@ -175,6 +176,7 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %num, %den
   ret i64 %result
@@ -364,7 +366,6 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_udiv_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
@@ -615,11 +616,11 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v3, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v12, v5, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v10, v0
 ; CGP-NEXT:    v_mov_b32_e32 v11, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v2
@@ -629,6 +630,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v4
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_2
@@ -798,6 +800,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB2_8
 ; CGP-NEXT:  .LBB2_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v7
@@ -950,6 +953,7 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -958,8 +962,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 define i64 @v_udiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_udiv_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %num, 4096
   ret i64 %result
@@ -968,9 +972,9 @@ define i64 @v_udiv_i64_pow2k_denom(i64 %num) {
 define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CHECK-LABEL: v_udiv_v2i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 12
 ; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -979,7 +983,6 @@ define <2 x i64> @v_udiv_v2i64_pow2k_denom(<2 x i64> %num) {
 define i64 @v_udiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_udiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1fb03c31
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xd9528440
 ; CHECK-NEXT:    v_mul_lo_u32 v4, v1, v2
@@ -1004,6 +1007,7 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %num, 1235195
   ret i64 %result
@@ -1012,7 +1016,6 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
 define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1fb03c31
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0xd9528440
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
@@ -1059,6 +1062,7 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 20
 ; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -1067,7 +1071,6 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
 define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_udiv_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -1078,6 +1081,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
@@ -1086,6 +1090,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v6
@@ -1238,6 +1243,7 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = udiv i64 %x, %shl.y
@@ -1247,7 +1253,6 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
@@ -1502,11 +1507,11 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v3, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v12, v5, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
@@ -1519,6 +1524,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
@@ -1690,6 +1696,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB8_8
 ; CGP-NEXT:  .LBB8_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
@@ -1842,6 +1849,7 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y
@@ -1851,7 +1859,6 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_udiv_i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -1874,11 +1881,11 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
@@ -1893,6 +1900,7 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
   %den.mask = and i64 %den, 16777215
@@ -1903,7 +1911,6 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
 define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_udiv_v2i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
@@ -2149,11 +2156,11 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v4
@@ -2182,6 +2189,7 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
index 7e38762e7b559ca..075478ac29fdc95 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -6,14 +6,14 @@
 define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
 ; GFX89-LABEL: test_min_max_ValK0_K1_u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_max_ValK0_K1_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
   %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
@@ -23,14 +23,14 @@ define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
 define i32 @min_max_ValK0_K1_i32(i32 %a) {
 ; GFX89-LABEL: min_max_ValK0_K1_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_max_ValK0_K1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 12, i32 %a)
   %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
@@ -40,14 +40,14 @@ define i32 @min_max_ValK0_K1_i32(i32 %a) {
 define i32 @test_min_K1max_ValK0__u32(i32 %a) {
 ; GFX89-LABEL: test_min_K1max_ValK0__u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_K1max_ValK0__u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
   %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax)
@@ -57,14 +57,14 @@ define i32 @test_min_K1max_ValK0__u32(i32 %a) {
 define i32 @test_min_K1max_K0Val__u32(i32 %a) {
 ; GFX89-LABEL: test_min_K1max_K0Val__u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_min_K1max_K0Val__u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 12, i32 %a)
   %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax)
@@ -74,14 +74,14 @@ define i32 @test_min_K1max_K0Val__u32(i32 %a) {
 define i32 @test_max_min_ValK1_K0_u32(i32 %a) {
 ; GFX89-LABEL: test_max_min_ValK1_K0_u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_min_ValK1_K0_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 17)
   %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12)
@@ -91,14 +91,14 @@ define i32 @test_max_min_ValK1_K0_u32(i32 %a) {
 define i32 @test_max_min_K1Val_K0_u32(i32 %a) {
 ; GFX89-LABEL: test_max_min_K1Val_K0_u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_min_K1Val_K0_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 17, i32 %a)
   %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12)
@@ -108,14 +108,14 @@ define i32 @test_max_min_K1Val_K0_u32(i32 %a) {
 define i32 @test_max_K0min_ValK1__u32(i32 %a) {
 ; GFX89-LABEL: test_max_K0min_ValK1__u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_ValK1__u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 17)
   %umed = call i32 @llvm.umax.i32(i32 12, i32 %umin)
@@ -125,14 +125,14 @@ define i32 @test_max_K0min_ValK1__u32(i32 %a) {
 define i32 @test_max_K0min_K1Val__u32(i32 %a) {
 ; GFX89-LABEL: test_max_K0min_K1Val__u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_K1Val__u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 17, i32 %a)
   %umed = call i32 @llvm.umax.i32(i32 12, i32 %umin)
@@ -142,7 +142,6 @@ define i32 @test_max_K0min_K1Val__u32(i32 %a) {
 define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) {
 ; GFX8-LABEL: test_max_K0min_K1Val__v2u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 17
 ; GFX8-NEXT:    v_min_u16_e32 v1, 17, v0
 ; GFX8-NEXT:    v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -150,20 +149,21 @@ define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) {
 ; GFX8-NEXT:    v_max_u16_e32 v1, 12, v1
 ; GFX8-NEXT:    v_max_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_max_K0min_K1Val__v2u16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_u16 v0, 17, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_max_u16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_max_K0min_K1Val__v2u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_u16 v0, 17, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_max_u16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umin = call <2 x i16> @llvm.umin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
   %umed = call <2 x i16> @llvm.umax.v2i16(<2 x i16> <i16 12, i16 12>, <2 x i16> %umin)
@@ -190,15 +190,15 @@ define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
 define i32 @test_non_inline_constant_u32(i32 %a) {
 ; GFX89-LABEL: test_non_inline_constant_u32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mov_b32_e32 v1, 0x41
 ; GFX89-NEXT:    v_med3_u32 v0, v0, 12, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_non_inline_constant_u32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 0x41
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
   %umed = call i32 @llvm.umin.i32(i32 %umax, i32 65)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 158403644607abe..1012950253695f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -7,7 +7,6 @@
 define i32 @v_urem_i32(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_urem_i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -25,11 +24,11 @@ define i32 @v_urem_i32(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; CGP-NEXT:    v_rcp_f32_e32 v2, v2
@@ -47,6 +46,7 @@ define i32 @v_urem_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, %den
   ret i32 %result
@@ -107,7 +107,6 @@ define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) {
 define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_urem_v2i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -142,11 +141,11 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i32:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
@@ -181,6 +180,7 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -189,8 +189,8 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 define i32 @v_urem_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 4096
   ret i32 %result
@@ -199,9 +199,9 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) {
 define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CHECK-LABEL: v_urem_v2i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0xfff, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 4096, i32 4096>
   ret <2 x i32> %result
@@ -210,7 +210,6 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) {
 define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
@@ -228,6 +227,7 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 1235195
   ret i32 %result
@@ -236,7 +236,6 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_urem_v2i32_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
@@ -264,11 +263,11 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0xffed2705
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
@@ -295,6 +294,7 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
   ret <2 x i32> %result
@@ -303,7 +303,6 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-LABEL: v_urem_i32_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
@@ -322,6 +321,7 @@ define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
@@ -331,7 +331,6 @@ define i32 @v_urem_i32_pow2_shl_denom(i32 %x, i32 %y) {
 define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-LABEL: v_urem_v2i32_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
@@ -368,11 +367,11 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i32_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
 ; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
@@ -409,6 +408,7 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
@@ -418,7 +418,6 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-LABEL: v_urem_i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -438,11 +437,11 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -462,6 +461,7 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i32 %num, 16777215
   %den.mask = and i32 %den, 16777215
@@ -472,7 +472,6 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-LABEL: v_urem_v2i32_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -511,11 +510,11 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i32_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -554,6 +553,7 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
   %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 12df4b7c7fc33d7..cbcb544eaf72251 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -7,7 +7,6 @@
 define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-LABEL: v_urem_i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v1
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v3
@@ -15,6 +14,7 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v2
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_3
@@ -23,6 +23,7 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v3
@@ -172,6 +173,7 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, %den
   ret i64 %result
@@ -358,7 +360,6 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_urem_v2i64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
@@ -607,11 +608,11 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v10, v0
 ; CGP-NEXT:    v_mov_b32_e32 v11, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v2
@@ -621,6 +622,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v4
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB2_2
@@ -787,6 +789,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB2_8
 ; CGP-NEXT:  .LBB2_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB2_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v7
@@ -936,6 +939,7 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, %den
   ret <2 x i64> %result
@@ -944,9 +948,9 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 define i64 @v_urem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 4096
   ret i64 %result
@@ -955,11 +959,11 @@ define i64 @v_urem_i64_pow2k_denom(i64 %num) {
 define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CHECK-LABEL: v_urem_v2i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0xfff, v0
 ; CHECK-NEXT:    v_and_b32_e32 v2, 0xfff, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -968,7 +972,6 @@ define <2 x i64> @v_urem_v2i64_pow2k_denom(<2 x i64> %num) {
 define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
@@ -1085,6 +1088,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %num, 1235195
   ret i64 %result
@@ -1093,7 +1097,6 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_urem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
@@ -1320,11 +1323,11 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
@@ -1488,6 +1491,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -1496,7 +1500,6 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-LABEL: v_urem_i64_pow2_shl_denom:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x1000
@@ -1507,6 +1510,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[7:8]
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
 ; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_3
@@ -1515,6 +1519,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    s_cbranch_execnz .LBB7_4
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v6
@@ -1664,6 +1669,7 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl i64 4096, %y
   %r = urem i64 %x, %shl.y
@@ -1673,7 +1679,6 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v10, 0
 ; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
@@ -1926,11 +1931,11 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
@@ -1943,6 +1948,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CGP-NEXT:    s_waitcnt expcnt(0)
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; CGP-NEXT:    s_cbranch_execz .LBB8_2
@@ -2111,6 +2117,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    s_cbranch_execnz .LBB8_8
 ; CGP-NEXT:  .LBB8_6:
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
 ; CGP-NEXT:  .LBB8_7:
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v10
@@ -2260,6 +2267,7 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y
@@ -2269,7 +2277,6 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-LABEL: v_urem_i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
@@ -2290,11 +2297,11 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v0
@@ -2311,6 +2318,7 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
   %den.mask = and i64 %den, 16777215
@@ -2321,7 +2329,6 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
 define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-LABEL: v_urem_v2i64_24bit:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v0
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v4
@@ -2563,11 +2570,11 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_24bit:
 ; CGP:       ; %bb.0:
-; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v2
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v4
@@ -2600,6 +2607,7 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
 ; CGP-NEXT:    v_mov_b32_e32 v1, 0
 ; CGP-NEXT:    v_mov_b32_e32 v3, 0
+; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
   %den.mask = and <2 x i64> %den, <i64 16777215, i64 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index a60370cd460f9ee..78cf513ccbe9624 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -8,39 +8,39 @@
 define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
 ; GFX6-LABEL: v_usubsat_i7:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
 ; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i7:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 9, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 9, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 9, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
   ret i7 %result
@@ -91,39 +91,39 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_usubsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -174,7 +174,6 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
 define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-LABEL: v_usubsat_v2i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
@@ -189,11 +188,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -204,11 +203,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -222,11 +221,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -240,11 +239,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -260,6 +259,7 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i16 %lhs.arg to <2 x i8>
   %rhs = bitcast i16 %rhs.arg to <2 x i8>
@@ -383,7 +383,6 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-LABEL: v_usubsat_v4i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
@@ -414,11 +413,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
@@ -445,11 +444,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
@@ -477,11 +476,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -508,11 +507,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v0
@@ -539,6 +538,7 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
   %rhs = bitcast i32 %rhs.arg to <4 x i8>
@@ -759,39 +759,39 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) {
 ; GFX6-LABEL: v_usubsat_i24:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i24:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs)
   ret i24 %result
@@ -842,27 +842,27 @@ define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_usubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -953,32 +953,32 @@ define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v2, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_min_u32_e32 v2, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -1027,37 +1027,37 @@ define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre
 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v3, v0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_min_u32_e32 v3, v1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_min_u32_e32 v3, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -1116,7 +1116,6 @@ define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre
 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v4, v0, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_min_u32_e32 v4, v1, v5
@@ -1125,33 +1124,34 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_min_u32_e32 v4, v3, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -1220,7 +1220,6 @@ define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre
 define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v5i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v5, v0, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_min_u32_e32 v5, v1, v6
@@ -1231,36 +1230,37 @@ define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_min_u32_e32 v5, v4, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v5i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v5 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v6 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v7 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v8 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v4, v9 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v5i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v5 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v6 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v7 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v8 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v4, v4, v9 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v5i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v7 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v8 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v4, v4, v9 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
   ret <5 x i32> %result
@@ -1339,7 +1339,6 @@ define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre
 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v16, v0, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
 ; GFX6-NEXT:    v_min_u32_e32 v16, v1, v17
@@ -1374,11 +1373,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_min_u32_e32 v16, v15, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
@@ -1397,11 +1396,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
@@ -1420,11 +1419,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
@@ -1443,11 +1442,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
@@ -1466,6 +1465,7 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -1654,30 +1654,30 @@ define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32>
 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_usubsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_u32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -1777,7 +1777,6 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_min_u32_e32 v2, v0, v2
@@ -1788,26 +1787,27 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v2, v0, v1 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -1950,7 +1950,6 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_min_u32_e32 v4, v0, v4
@@ -1971,31 +1970,32 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v4, v0, v2 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v2, v1, v3 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -2089,7 +2089,6 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v6i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
 ; GFX6-NEXT:    v_min_u32_e32 v6, v0, v6
@@ -2120,11 +2119,11 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v6i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v6, v0, v3 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v3, v1, v4 clamp
@@ -2134,22 +2133,23 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v6i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v6i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
   %cast = bitcast <6 x i16> %result to <3 x float>
@@ -2258,7 +2258,6 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
 ; GFX6-NEXT:    v_min_u32_e32 v8, v0, v8
@@ -2299,11 +2298,11 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX6-NEXT:    v_alignbit_b32 v2, v5, v4, 16
 ; GFX6-NEXT:    v_alignbit_b32 v3, v7, v6, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v8, v0, v4 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v4, v1, v5 clamp
@@ -2316,24 +2315,25 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v8i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v8i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
   %cast = bitcast <8 x i16> %result to <4 x float>
@@ -2468,7 +2468,6 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-LABEL: v_usubsat_i48:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
@@ -2481,11 +2480,11 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i48:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
@@ -2493,11 +2492,11 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
@@ -2505,11 +2504,11 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i48:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[0:1], 16, v[0:1]
 ; GFX10PLUS-NEXT:    v_lshlrev_b64 v[2:3], 16, v[2:3]
 ; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
@@ -2517,6 +2516,7 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
 ; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 16, v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs)
   ret i48 %result
@@ -2694,38 +2694,38 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_usubsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
 ; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
@@ -2846,7 +2846,6 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
 define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -2855,11 +2854,11 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -2868,11 +2867,11 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -2881,11 +2880,11 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_sub_co_u32 v2, s4, v2, v6
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
@@ -2894,11 +2893,11 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
 ; GFX11-NEXT:    v_sub_co_u32 v2, s0, v2, v6
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
@@ -2907,6 +2906,7 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
   ret <2 x i64> %result
@@ -3127,7 +3127,6 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
@@ -3144,11 +3143,11 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v10, vcc
@@ -3165,11 +3164,11 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v8
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v10, vcc
@@ -3186,11 +3185,11 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v2i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
 ; GFX10-NEXT:    v_sub_co_u32 v4, s4, v4, v12
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
@@ -3207,11 +3206,11 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v2i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v8
 ; GFX11-NEXT:    v_sub_co_u32 v4, s0, v4, v12
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
@@ -3228,6 +3227,7 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, 0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
index 7c9e2a51a8d2e33..14cb1d76d09435c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
@@ -9,14 +9,14 @@
 define i32 @check_v_bfe(i16 %a) {
 ; PREGFX9-LABEL: check_v_bfe:
 ; PREGFX9:       ; %bb.0: ; %entry
-; PREGFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; PREGFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; PREGFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; PREGFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: check_v_bfe:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = sext i16 %a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index ebad63e76a009c1..2261da198afeaf3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -214,16 +214,16 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
 ; GCN-LABEL: vector_xnor_i32_one_use:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_not_b32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: vector_xnor_i32_one_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_not_b32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %xor = xor i32 %a, %b
@@ -234,20 +234,20 @@ entry:
 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
 ; GCN-LABEL: vector_xnor_i64_one_use:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: vector_xnor_i64_one_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_not_b32_e32 v0, v0
 ; GFX10-NEXT:    v_not_b32_e32 v1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %xor = xor i64 %a, %b
@@ -399,35 +399,35 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
 define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
 ; GFX7-LABEL: vector_xor_na_b_i32_one_use:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_not_b32_e32 v0, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: vector_xor_na_b_i32_one_use:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: vector_xor_na_b_i32_one_use:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_not_b32_e32 v0, v0
 ; GFX900-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: vector_xor_na_b_i32_one_use:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xnor_b32_e32 v0, v0, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: vector_xor_na_b_i32_one_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor3_b32 v0, v0, -1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %na = xor i32 %a, -1
@@ -438,35 +438,35 @@ entry:
 define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
 ; GFX7-LABEL: vector_xor_a_nb_i32_one_use:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_not_b32_e32 v1, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: vector_xor_a_nb_i32_one_use:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: vector_xor_a_nb_i32_one_use:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_not_b32_e32 v1, v1
 ; GFX900-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: vector_xor_a_nb_i32_one_use:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_xnor_b32_e32 v0, v1, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: vector_xor_a_nb_i32_one_use:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor3_b32 v0, v1, -1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %nb = xor i32 %b, -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
index 9cd9c4734fbe692..8be22d3b1e095f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll
@@ -8,45 +8,45 @@
 define i32 @zextload_global_i1_to_i32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i1_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i1_to_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i1_to_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i1_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i1_to_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i1, ptr addrspace(1) %ptr
   %ext = zext i1 %load to i32
@@ -56,40 +56,35 @@ define i32 @zextload_global_i1_to_i32(ptr addrspace(1) %ptr) {
 define i32 @zextload_global_i8_to_i32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i8_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i8_to_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i8_to_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i8_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i8_to_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i8, ptr addrspace(1) %ptr
   %ext = zext i8 %load to i32
@@ -99,40 +94,35 @@ define i32 @zextload_global_i8_to_i32(ptr addrspace(1) %ptr) {
 define i32 @zextload_global_i16_to_i32(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i16_to_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i16_to_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i16_to_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i16_to_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i16_to_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i16, ptr addrspace(1) %ptr
   %ext = zext i16 %load to i32
@@ -142,25 +132,24 @@ define i32 @zextload_global_i16_to_i32(ptr addrspace(1) %ptr) {
 define i64 @zextload_global_i1_to_i64(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i1_to_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i1_to_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i1_to_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -168,23 +157,24 @@ define i64 @zextload_global_i1_to_i64(ptr addrspace(1) %ptr) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i1_to_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i1_to_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i1, ptr addrspace(1) %ptr
   %ext = zext i1 %load to i64
@@ -194,45 +184,40 @@ define i64 @zextload_global_i1_to_i64(ptr addrspace(1) %ptr) {
 define i64 @zextload_global_i8_to_i64(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i8_to_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i8_to_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i8_to_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i8_to_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i8_to_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i8, ptr addrspace(1) %ptr
   %ext = zext i8 %load to i64
@@ -242,45 +227,40 @@ define i64 @zextload_global_i8_to_i64(ptr addrspace(1) %ptr) {
 define i64 @zextload_global_i16_to_i64(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i16_to_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i16_to_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i16_to_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i16_to_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i16_to_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i16, ptr addrspace(1) %ptr
   %ext = zext i16 %load to i64
@@ -290,45 +270,40 @@ define i64 @zextload_global_i16_to_i64(ptr addrspace(1) %ptr) {
 define i64 @zextload_global_i32_to_i64(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i32_to_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i32_to_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i32_to_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i32_to_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i32_to_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i32, ptr addrspace(1) %ptr
   %ext = zext i32 %load to i64
@@ -338,49 +313,44 @@ define i64 @zextload_global_i32_to_i64(ptr addrspace(1) %ptr) {
 define i96 @zextload_global_i32_to_i96(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i32_to_i96:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i32_to_i96:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i32_to_i96:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i32_to_i96:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i32_to_i96:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i32, ptr addrspace(1) %ptr
   %ext = zext i32 %load to i96
@@ -390,27 +360,24 @@ define i96 @zextload_global_i32_to_i96(ptr addrspace(1) %ptr) {
 define i128 @zextload_global_i32_to_i128(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: zextload_global_i32_to_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: zextload_global_i32_to_i128:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: zextload_global_i32_to_i128:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b64 s[4:5], 0
@@ -418,26 +385,24 @@ define i128 @zextload_global_i32_to_i128(ptr addrspace(1) %ptr) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zextload_global_i32_to_i128:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zextload_global_i32_to_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load i32, ptr addrspace(1) %ptr
   %ext = zext i32 %load to i128
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index c7bc584d42d5ad3..b68de3ef82acbcc 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -16,9 +16,9 @@ declare hidden void @requires_all_inputs()
 define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-LABEL: parent_func_missing_inputs:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_mov_b32 s16, s33
 ; FIXEDABI-NEXT:    s_mov_b32 s33, s32
+; FIXEDABI-NEXT:    s_waitcnt expcnt(0)
 ; FIXEDABI-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; FIXEDABI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; FIXEDABI-NEXT:    s_mov_b64 exec, s[18:19]
@@ -96,7 +96,6 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 {
 define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-SDAG-LABEL: marked_func_use_workitem_id:
 ; FIXEDABI-SDAG:       ; %bb.0:
-; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -105,12 +104,11 @@ define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    v_bfe_u32 v2, v31, 20, 10
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FIXEDABI-GISEL-LABEL: marked_func_use_workitem_id:
 ; FIXEDABI-GISEL:       ; %bb.0:
-; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; FIXEDABI-GISEL-NEXT:    v_bfe_u32 v3, v31, 10, 10
 ; FIXEDABI-GISEL-NEXT:    v_bfe_u32 v4, v31, 20, 10
@@ -119,7 +117,7 @@ define void @marked_func_use_workitem_id(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-GISEL-NEXT:    flat_store_dword v[0:1], v3
 ; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    flat_store_dword v[0:1], v4
-; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -157,7 +155,6 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
 define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_func_use_workgroup_id:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s12
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
@@ -166,7 +163,7 @@ define void @marked_func_use_workgroup_id(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s14
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
   %id.x = call i32 @llvm.amdgcn.workgroup.id.x()
   %id.y = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -206,7 +203,6 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_func_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s7
 ; FIXEDABI-NEXT:    flat_load_ubyte v2, v[2:3] glc
@@ -222,7 +218,7 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s11
 ; FIXEDABI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
   %queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
   %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -275,7 +271,6 @@ define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) #0 {
 ; FIXEDABI-SDAG-LABEL: addrspacecast_requires_queue_ptr:
 ; FIXEDABI-SDAG:       ; %bb.0:
-; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x40
 ; FIXEDABI-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
@@ -291,12 +286,11 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 ; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; FIXEDABI-SDAG-NEXT:    flat_store_dword v[4:5], v0
-; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; FIXEDABI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FIXEDABI-GISEL-LABEL: addrspacecast_requires_queue_ptr:
 ; FIXEDABI-GISEL:       ; %bb.0:
-; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x40
 ; FIXEDABI-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
 ; FIXEDABI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
@@ -312,7 +306,7 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 ; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    v_mov_b32_e32 v2, 2
 ; FIXEDABI-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; FIXEDABI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr
   %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr
@@ -324,13 +318,12 @@ define void @addrspacecast_requires_queue_ptr(ptr addrspace(5) %ptr.private, ptr
 define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
 ; FIXEDABI-LABEL: is_shared_requires_queue_ptr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_load_dword s4, s[6:7], 0x40
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
 ; FIXEDABI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
   %zext = zext i1 %is.shared to i32
@@ -341,13 +334,12 @@ define void @is_shared_requires_queue_ptr(ptr %ptr) #0 {
 define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
 ; FIXEDABI-LABEL: is_private_requires_queue_ptr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_load_dword s4, s[6:7], 0x44
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
 ; FIXEDABI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v0
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
+; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; FIXEDABI-NEXT:    s_setpc_b64 s[30:31]
   %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
   %zext = zext i1 %is.private to i32
@@ -358,7 +350,6 @@ define void @is_private_requires_queue_ptr(ptr %ptr) #0 {
 define void @trap_requires_queue() #0 {
 ; FIXEDABI-LABEL: trap_requires_queue:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; FIXEDABI-NEXT:    s_trap 2
   call void @llvm.trap()
@@ -368,7 +359,6 @@ define void @trap_requires_queue() #0 {
 define void @debugtrap_requires_queue() #0 {
 ; FIXEDABI-LABEL: debugtrap_requires_queue:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FIXEDABI-NEXT:    s_trap 3
   call void @llvm.debugtrap()
   unreachable
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
index 2d60faaf8c8c034..8a48872e6f13242 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.gfx6.ll
@@ -23,13 +23,13 @@ define ptr addrspace(1) @flat_to_gobal_addrspacecast(ptr %ptr) {
 define ptr @group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; CHECK-LABEL: group_to_flat_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   ret ptr %stof
@@ -38,9 +38,9 @@ define ptr @group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 define ptr addrspace(3) @flat_to_group_addrspacecast(ptr %ptr) {
 ; CHECK-LABEL: flat_to_group_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr %ptr to ptr addrspace(3)
   ret ptr addrspace(3) %stof
@@ -49,13 +49,13 @@ define ptr addrspace(3) @flat_to_group_addrspacecast(ptr %ptr) {
 define ptr @private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; CHECK-LABEL: private_to_flat_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_load_dword s4, s[6:7], 0x11
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, -1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   ret ptr %stof
@@ -64,9 +64,9 @@ define ptr @private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 define ptr addrspace(5) @flat_to_private_addrspacecast(ptr %ptr) {
 ; CHECK-LABEL: flat_to_private_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr %ptr to ptr addrspace(5)
   ret ptr addrspace(5) %stof
@@ -75,8 +75,8 @@ define ptr addrspace(5) @flat_to_private_addrspacecast(ptr %ptr) {
 define ptr addrspace(6) @constant_to_constant32_addrspacecast(ptr addrspace(4) %ptr, i32 %offset) {
 ; CHECK-LABEL: constant_to_constant32_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %addrspacecast = addrspacecast ptr addrspace(4) %ptr to ptr addrspace(6)
   %gep = getelementptr i8, ptr addrspace(6) %addrspacecast, i32 %offset
@@ -86,8 +86,8 @@ define ptr addrspace(6) @constant_to_constant32_addrspacecast(ptr addrspace(4) %
 define ptr addrspace(6) @global_to_constant32_addrspacecast(ptr addrspace(1) %ptr, i32 %offset) {
 ; CHECK-LABEL: global_to_constant32_addrspacecast:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %addrspacecast = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(6)
   %gep = getelementptr i8, ptr addrspace(6) %addrspacecast, i32 %offset
@@ -97,8 +97,8 @@ define ptr addrspace(6) @global_to_constant32_addrspacecast(ptr addrspace(1) %pt
 define ptr @constant32bit_to_flat_addrspacecast_0(ptr addrspace(6) %ptr) {
 ; CHECK-LABEL: constant32bit_to_flat_addrspacecast_0:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(6) %ptr to ptr
   ret ptr %stof
@@ -107,8 +107,8 @@ define ptr @constant32bit_to_flat_addrspacecast_0(ptr addrspace(6) %ptr) {
 define ptr @constant32bit_to_flat_addrspacecast_1(ptr addrspace(6) %ptr) #0 {
 ; CHECK-LABEL: constant32bit_to_flat_addrspacecast_1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(6) %ptr to ptr
   ret ptr %stof
@@ -117,9 +117,9 @@ define ptr @constant32bit_to_flat_addrspacecast_1(ptr addrspace(6) %ptr) #0 {
 define ptr @constant32bit_to_flat_addrspacecast_null() {
 ; CHECK-LABEL: constant32bit_to_flat_addrspacecast_null:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(6) null to ptr
   ret ptr %stof
@@ -128,8 +128,8 @@ define ptr @constant32bit_to_flat_addrspacecast_null() {
 define ptr @constant32bit_to_flat_addrspacecast_undef() #0 {
 ; SDAG-LABEL: constant32bit_to_flat_addrspacecast_undef:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: constant32bit_to_flat_addrspacecast_undef:
@@ -143,8 +143,8 @@ define ptr @constant32bit_to_flat_addrspacecast_undef() #0 {
 define ptr @constant32bit_to_flat_addrspacecast_poison() #0 {
 ; SDAG-LABEL: constant32bit_to_flat_addrspacecast_poison:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: constant32bit_to_flat_addrspacecast_poison:
@@ -158,9 +158,9 @@ define ptr @constant32bit_to_flat_addrspacecast_poison() #0 {
 define ptr @constant32bit_to_flat_addrspacecast_constant() #0 {
 ; CHECK-LABEL: constant32bit_to_flat_addrspacecast_constant:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3039
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %stof = addrspacecast ptr addrspace(6) inttoptr (i32 12345 to ptr addrspace(6)) to ptr
   ret ptr %stof
@@ -169,9 +169,9 @@ define ptr @constant32bit_to_flat_addrspacecast_constant() #0 {
 define ptr addrspace(1) @addrspacecast_flat_null_to_global() {
 ; CHECK-LABEL: addrspacecast_flat_null_to_global:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   ret ptr addrspace(1) addrspacecast (ptr null to ptr addrspace(1))
 }
@@ -179,8 +179,8 @@ define ptr addrspace(1) @addrspacecast_flat_null_to_global() {
 define ptr addrspace(3) @addrspacecast_flat_null_to_group() {
 ; CHECK-LABEL: addrspacecast_flat_null_to_group:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   ret ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3))
 }
@@ -188,8 +188,8 @@ define ptr addrspace(3) @addrspacecast_flat_null_to_group() {
 define ptr addrspace(5) @addrspacecast_flat_null_to_private() {
 ; CHECK-LABEL: addrspacecast_flat_null_to_private:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, -1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   ret ptr addrspace(5) addrspacecast (ptr null to ptr addrspace(5))
 }
@@ -197,8 +197,8 @@ define ptr addrspace(5) @addrspacecast_flat_null_to_private() {
 define ptr addrspace(6) @addrspacecast_flat_null_to_constant32bit() {
 ; CHECK-LABEL: addrspacecast_flat_null_to_constant32bit:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   ret ptr addrspace(6) addrspacecast (ptr null to ptr addrspace(6))
 }
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index bf5843ea8047d5f..0a1335ab630d527 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -7,7 +7,6 @@
 define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v33, v0
 ; GFX908-NEXT:    ;;#ASMSTART
@@ -154,11 +153,11 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use a3 v[0:31]
 ; GFX908-NEXT:    ;;#ASMEND
+; GFX908-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: no_free_vgprs_at_agpr_to_agpr_copy:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -230,6 +229,7 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a3 v[0:31]
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
   %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
@@ -372,7 +372,6 @@ define amdgpu_kernel void @no_agpr_no_reserve(ptr addrspace(1) %arg) #0 {
 define void @v32_asm_def_use(float %v0, float %v1) #0 {
 ; GFX908-LABEL: v32_asm_def_use:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v33, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX908-NEXT:    ;;#ASMSTART
@@ -449,11 +448,11 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use v32
 ; GFX908-NEXT:    ;;#ASMEND
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v32_asm_def_use:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v34, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -497,6 +496,7 @@ define void @v32_asm_def_use(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use v32
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${a[0:15]}"()
   %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
@@ -913,7 +913,6 @@ bb58:                                             ; preds = %bb51, %bb16
 define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v33, v0
 ; GFX908-NEXT:    ;;#ASMSTART
@@ -1060,11 +1059,11 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; use a3 v[0:31]
 ; GFX908-NEXT:    ;;#ASMEND
+; GFX908-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: no_free_vgprs_at_sgpr_to_agpr_copy:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -1138,6 +1137,7 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ; use a3 v[0:31]
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v34 ; Reload Reuse
+; GFX90A-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"()
   %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
index d42fd6366c0bacb..4c01fcc09e4b07b 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll
@@ -28,7 +28,6 @@ define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 {
 define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10) #1 {
 ; GFX908-LABEL: remat_regcopy_avoids_spill:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v1
 ; GFX908-NEXT:    v_accvgpr_write_b32 a4, v2
@@ -42,6 +41,7 @@ define void @remat_regcopy_avoids_spill(i32 %v0, i32 %v1, i32 %v2, i32 %v3, i32
 ; GFX908-NEXT:    v_accvgpr_write_b32 a4, v6
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ;;#ASMEND
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "", "a,a,a,a"(i32 %v0, i32 %v1, i32 %v2, i32 %v3)
   call void asm sideeffect "", "a,a,a,a,a"(i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8)
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index ed09c887d1ea5d5..3094f36201ad0c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -22,17 +22,14 @@ define amdgpu_kernel void @test0() {
 define void @test1() {
 ; GFX9-LABEL: test1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: test1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_endpgm
   tail call void @llvm.amdgcn.endpgm()
   unreachable
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 5c40a4ce13e31a1..77d4578a2e1e78e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -9,12 +9,12 @@ define i32 @select_sdiv_lhs_const_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_lhs_const_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x1e848
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x30d40
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = sdiv i32 1000000, %select
@@ -28,12 +28,12 @@ define i32 @select_sdiv_rhs_const_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_rhs_const_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x2710
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x3e8
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 42000, i32 420000
   %op = sdiv i32 %select, 42
@@ -47,13 +47,13 @@ define <2 x i32> @select_sdiv_lhs_const_v2i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_lhs_const_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x22b
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x29a
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x594
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 5, i32 undef>, <2 x i32> <i32 6, i32 7>
   %op = sdiv <2 x i32> <i32 3333, i32 9999>, %select
@@ -67,7 +67,6 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_rhs_const_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x3661c
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x307dd
@@ -76,6 +75,7 @@ define <2 x i32> @select_sdiv_rhs_const_v2i32(i1 %cond) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x23b02a
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x13e3a0c
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, <2 x i32> <i32 8342123, i32 834212353>, <2 x i32> <i32 9355456, i32 93554321>
   %op = sdiv <2 x i32> %select, <i32 42, i32 40>
@@ -125,7 +125,6 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_lhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
@@ -160,6 +159,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 5
   %op = sdiv i32 1000000, %select
@@ -207,7 +207,6 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_lhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
@@ -242,6 +241,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 1000000, %select
@@ -256,7 +256,6 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_rhs_opaque_const0_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
@@ -272,6 +271,7 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 ptrtoint (ptr addrspace(1) @gv to i32), i32 234234
   %op = sdiv i32 %select, 42
@@ -286,7 +286,6 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_sdiv_rhs_opaque_const1_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, gv at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, gv at gotpcrel32@hi+12
@@ -302,6 +301,7 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 3, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 42000, i32 ptrtoint (ptr addrspace(1) @gv to i32)
   %op = sdiv i32 %select, 42
@@ -315,12 +315,12 @@ define i32 @select_add_lhs_const_i32(i1 %cond) {
 ;
 ; GCN-LABEL: select_add_lhs_const_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0xf4248
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0xf4245
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 5, i32 8
   %op = add i32 1000000, %select
@@ -334,12 +334,12 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
 ;
 ; GCN-LABEL: select_fadd_lhs_const_i32_fmf:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40a00000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 2.0, float 4.0
   %op = fadd nnan nsz float 1.0, %select
@@ -350,12 +350,12 @@ define float @select_fadd_lhs_const_i32_fmf(i1 %cond) {
 define i32 @select_mul_lhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_lhs_const_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_mul_lhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
@@ -369,12 +369,12 @@ define i32 @select_mul_lhs_const_i32(i1 %cond) {
 define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; GCN-LABEL: select_mul_rhs_const_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f40
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x1388
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
@@ -410,10 +410,10 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 define i16 @select_add_trunc_select(i1 %cond) {
 ; GCN-LABEL: select_add_trunc_select:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ; IR-LABEL: @select_add_trunc_select(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
@@ -431,10 +431,10 @@ define i32 @select_add_sext_select(i1 %cond) {
 ;
 ; GCN-LABEL: select_add_sext_select:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 -13, i16 8
   %trunc = sext i16 %select to i32
@@ -449,10 +449,10 @@ define i32 @select_add_zext_select(i1 %cond) {
 ;
 ; GCN-LABEL: select_add_zext_select:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 5, i16 8
   %trunc = zext i16 %select to i32
@@ -467,12 +467,12 @@ define i32 @select_add_bitcast_select(i1 %cond) {
 ;
 ; GCN-LABEL: select_add_bitcast_select:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000002a
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x3f80002a
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, float 1.0, float 2.0
   %trunc = bitcast float %select to i32
@@ -492,7 +492,6 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
 ;
 ; GCN-LABEL: multi_use_cast_regression:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x3c00
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -500,6 +499,7 @@ define <2 x half> @multi_use_cast_regression(i1 %cond) {
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_sub_f32_e32 v1, 1.0, v0
 ; GCN-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, half 1.000000e+00, half 0.000000e+00
   %fpext = fpext half %select to float
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 3e572f95ab9a5ae..aa8d2c8be046270 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -9,22 +9,18 @@ declare amdgpu_gfx void @use(...)
 define amdgpu_cs_chain void @amdgpu_cs_chain_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_no_stack:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_no_stack:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_no_stack:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_no_stack:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_endpgm
   ret void
 }
@@ -32,7 +28,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_no_stack({ptr, i32, <4 x i32>} inre
 define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr, <4 x i32> %vgpr) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -45,7 +40,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_simple_call:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v4, v8
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v5, v9
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v6, v10
@@ -64,7 +58,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_simple_call:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -77,7 +70,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_simple_call:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v7, v11
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v6, v10
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v5, v9
@@ -100,7 +92,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24 x i32> %vgprs) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_mov_b32 s32, 0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
 ; GISEL-GFX11-NEXT:    s_add_u32 s24, s32, 4
@@ -161,7 +152,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_spill:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v32, v8
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v33, v9
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v34, v10
@@ -228,7 +218,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s32, 0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
 ; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 60
@@ -289,7 +278,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_spill:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v32, v15
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v33, v14
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v34, v13
@@ -360,7 +348,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 define amdgpu_cs_chain void @alloca_and_call() {
 ; GISEL-GFX11-LABEL: alloca_and_call:
 ; GISEL-GFX11:       ; %bb.0: ; %.entry
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; GISEL-GFX11-NEXT:    s_mov_b32 s0, use at abs32@lo
 ; GISEL-GFX11-NEXT:    s_mov_b32 s1, use at abs32@hi
@@ -372,7 +359,6 @@ define amdgpu_cs_chain void @alloca_and_call() {
 ;
 ; GISEL-GFX10-LABEL: alloca_and_call:
 ; GISEL-GFX10:       ; %bb.0: ; %.entry
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; GISEL-GFX10-NEXT:    s_mov_b32 s4, use at abs32@lo
@@ -386,7 +372,6 @@ define amdgpu_cs_chain void @alloca_and_call() {
 ;
 ; DAGISEL-GFX11-LABEL: alloca_and_call:
 ; DAGISEL-GFX11:       ; %bb.0: ; %.entry
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v0, 42
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s1, use at abs32@hi
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s0, use at abs32@lo
@@ -398,7 +383,6 @@ define amdgpu_cs_chain void @alloca_and_call() {
 ;
 ; DAGISEL-GFX10-LABEL: alloca_and_call:
 ; DAGISEL-GFX10:       ; %bb.0: ; %.entry
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; DAGISEL-GFX10-NEXT:    s_mov_b64 s[0:1], s[48:49]
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s5, use at abs32@hi
@@ -504,7 +488,6 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -519,7 +502,6 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -534,7 +516,6 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -549,7 +530,6 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -569,9 +549,9 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain_wwm:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 3
+; GISEL-GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; GISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -589,9 +569,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_wwm:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; GISEL-GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -608,9 +588,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_wwm:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 3
+; DAGISEL-GFX11-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; DAGISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -628,9 +608,9 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_wwm:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; DAGISEL-GFX10-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; DAGISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -655,7 +635,6 @@ define amdgpu_cs_chain void @chain_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b
 define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -670,7 +649,6 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -685,7 +663,6 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -700,7 +677,6 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -720,7 +696,6 @@ define amdgpu_cs_chain void @chain_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3
 define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain_fewer_args:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT:    s_mov_b32 s2, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -735,7 +710,6 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_fewer_args:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT:    s_mov_b32 s2, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -750,7 +724,6 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -765,7 +738,6 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_fewer_args:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -787,7 +759,6 @@ define amdgpu_cs_chain void @chain_to_chain_fewer_args(<3 x i32> inreg %a, <3 x
 define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain_more_args:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -803,7 +774,6 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_more_args:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -820,7 +790,6 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_more_args:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -836,7 +805,6 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_more_args:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -860,7 +828,6 @@ define amdgpu_cs_chain void @chain_to_chain_more_args(<3 x i32> inreg %a, <3 x i
 define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, 4
 ; GISEL-GFX11-NEXT:    s_mov_b32 s2, 3
 ; GISEL-GFX11-NEXT:    s_mov_b32 s1, 2
@@ -876,7 +843,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v8
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 2
@@ -895,7 +861,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_dont_realign_stack:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
 ; DAGISEL-GFX11-NEXT:    v_lshl_add_u32 v4, v8, 4, 32
@@ -905,7 +870,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_dont_realign_stack(i32 %idx) {
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_dont_realign_stack:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v0, 4
 ; DAGISEL-GFX10-NEXT:    v_lshl_add_u32 v1, v8, 4, 32
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
index 4190d073e2561a8..087d87c33979505 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll
@@ -7,22 +7,18 @@
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_no_stack({ptr, i32, <4 x i32>} inreg %a, {ptr, i32, <4 x i32>} %b) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_no_stack:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_endpgm
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_no_stack:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_no_stack:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_endpgm
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_no_stack:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_endpgm
   ret void
 }
@@ -115,7 +111,6 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_to_chain_preserve:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -130,7 +125,6 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ;
 ; GISEL-GFX10-LABEL: chain_to_chain_preserve:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -145,7 +139,6 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ;
 ; DAGISEL-GFX11-LABEL: chain_to_chain_preserve:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    ;;#ASMSTART
@@ -160,7 +153,6 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 ;
 ; DAGISEL-GFX10-LABEL: chain_to_chain_preserve:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    ;;#ASMSTART
@@ -180,7 +172,6 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %a, <3 x i3
 define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
@@ -197,7 +188,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
@@ -214,7 +204,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
@@ -231,7 +220,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
@@ -253,7 +241,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve(<3 x i32>
 define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
@@ -270,7 +257,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
@@ -287,7 +273,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
@@ -304,7 +289,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
@@ -326,10 +310,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain(<3 x i32> inreg %a
 define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_wwm:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 3
+; GISEL-GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; GISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -348,10 +332,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_wwm:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; GISEL-GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; GISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -369,10 +353,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_wwm:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 3
+; DAGISEL-GFX11-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; DAGISEL-GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -391,10 +375,10 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_wwm:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s3, s0
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; DAGISEL-GFX10-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 4
 ; DAGISEL-GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -420,7 +404,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_wwm(<3 x i32> inre
 define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_clause 0x1
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v11, off offset:8
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4
@@ -441,7 +424,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
@@ -461,7 +443,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_clause 0x1
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v11, off offset:8
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4
@@ -482,7 +463,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_use_all_v0_v7:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v11, off, s[48:51], 0 offset:8 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v11, v8
@@ -507,7 +487,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_use_all_v0_v7(<3 x
 define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_args(<3 x i32> inreg %a, <3 x i32> %b) {
 ; GISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX11-NEXT:    s_mov_b32 s2, s0
 ; GISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
@@ -524,7 +503,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ;
 ; GISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; GISEL-GFX10-NEXT:    s_mov_b32 s2, s0
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
@@ -541,7 +519,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ;
 ; DAGISEL-GFX11-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, off offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX11-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX11-NEXT:    s_mov_b32 s2, s0
@@ -558,7 +535,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 ;
 ; DAGISEL-GFX10-LABEL: chain_preserve_to_chain_preserve_fewer_args:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    buffer_store_dword v16, off, s[48:51], 0 offset:4 ; 4-byte Folded Spill
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v1, v8
 ; DAGISEL-GFX10-NEXT:    s_mov_b32 s2, s0
@@ -585,7 +561,6 @@ define amdgpu_cs_chain_preserve void @chain_preserve_to_chain_preserve_fewer_arg
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stack(i32 %idx) {
 ; GISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_mov_b32 s3, 4
 ; GISEL-GFX11-NEXT:    s_mov_b32 s2, 3
 ; GISEL-GFX11-NEXT:    s_mov_b32 s1, 2
@@ -601,7 +576,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ;
 ; GISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack:
 ; GISEL-GFX10:       ; %bb.0:
-; GISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v8
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 2
@@ -620,7 +594,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ;
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack:
 ; DAGISEL-GFX11:       ; %bb.0:
-; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
 ; DAGISEL-GFX11-NEXT:    v_lshl_add_u32 v4, v8, 4, 32
@@ -630,7 +603,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_dont_realign_stac
 ;
 ; DAGISEL-GFX10-LABEL: amdgpu_cs_chain_preserve_dont_realign_stack:
 ; DAGISEL-GFX10:       ; %bb.0:
-; DAGISEL-GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v0, 4
 ; DAGISEL-GFX10-NEXT:    v_lshl_add_u32 v1, v8, 4, 32
 ; DAGISEL-GFX10-NEXT:    v_mov_b32_e32 v2, 3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
index 9f5b6389ab59f5d..33d91deafa30b27 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-mul24-knownbits.ll
@@ -32,11 +32,11 @@ entry:
 define i32 @f(i32 %x, i32 %y) {
 ; GCN-LABEL: f:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_or_b32_e32 v0, 0xffff80, v0
 ; GCN-NEXT:    v_or_b32_e32 v1, 0xffff80, v1
 ; GCN-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 14, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %xx = or i32 %x, -128 ; 0xffffff80
   %yy = or i32 %y, -128 ; 0xffffff80
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 8c64ab5952def75..2765cddb2d5f996 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -20,7 +20,6 @@ declare hidden half @_Z4pownDhi(half, i32)
 define half @test_pow_fast_f16(half %x, half %y) {
 ; CHECK-LABEL: test_pow_fast_f16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powDhDh at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powDhDh at rel32@hi+12
@@ -32,7 +31,6 @@ define half @test_pow_fast_f16(half %x, half %y) {
 define float @test_pow_fast_f32(float %x, float %y) {
 ; CHECK-LABEL: test_pow_fast_f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powff at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powff at rel32@hi+12
@@ -44,7 +42,6 @@ define float @test_pow_fast_f32(float %x, float %y) {
 define double @test_pow_fast_f64(double %x, double %y) {
 ; CHECK-LABEL: test_pow_fast_f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[16:17]
 ; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powdd at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powdd at rel32@hi+12
@@ -56,7 +53,6 @@ define double @test_pow_fast_f64(double %x, double %y) {
 define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-LABEL: test_pow_fast_f16__integral_y:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -69,6 +65,7 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 ; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
 ; CHECK-NEXT:    v_exp_f16_e32 v2, v2
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to half
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
@@ -78,7 +75,6 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
 define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-LABEL: test_pow_fast_f32__integral_y:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT:    s_mov_b32 s4, 0x800000
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -103,6 +99,7 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float
   %pow = tail call fast float @_Z3powff(float %x, float %y)
@@ -112,9 +109,9 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-LABEL: test_pow_fast_f64__integral_y:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
@@ -213,10 +210,10 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 define half @test_powr_fast_f16(half %x, half %y) {
 ; CHECK-LABEL: test_powr_fast_f16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_log_f16_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
   ret half %powr
@@ -225,7 +222,6 @@ define half @test_powr_fast_f16(half %x, half %y) {
 define float @test_powr_fast_f32(float %x, float %y) {
 ; CHECK-LABEL: test_powr_fast_f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x800000
 ; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
@@ -245,6 +241,7 @@ define float @test_powr_fast_f32(float %x, float %y) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast float @_Z4powrff(float %x, float %y)
   ret float %powr
@@ -253,9 +250,9 @@ define float @test_powr_fast_f32(float %x, float %y) {
 define double @test_powr_fast_f64(double %x, double %y) {
 ; CHECK-LABEL: test_powr_fast_f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
@@ -348,7 +345,6 @@ define double @test_powr_fast_f64(double %x, double %y) {
 define half @test_pown_fast_f16(half %x, i32 %y) {
 ; CHECK-LABEL: test_pown_fast_f16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
 ; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
 ; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
@@ -357,6 +353,7 @@ define half @test_pown_fast_f16(half %x, i32 %y) {
 ; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
 ; CHECK-NEXT:    v_exp_f16_e32 v2, v2
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
@@ -365,7 +362,6 @@ define half @test_pown_fast_f16(half %x, i32 %y) {
 define float @test_pown_fast_f32(float %x, i32 %y) {
 ; CHECK-LABEL: test_pown_fast_f32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x800000
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
@@ -388,6 +384,7 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
@@ -396,9 +393,9 @@ define float @test_pown_fast_f32(float %x, i32 %y) {
 define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-LABEL: test_pown_fast_f64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
@@ -492,13 +489,13 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f16_known_even:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; CHECK-NEXT:    v_exp_f16_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -508,7 +505,6 @@ define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
 define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f32_known_even:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x800000
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
@@ -530,6 +526,7 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
@@ -539,9 +536,9 @@ define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
 define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f64_known_even:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
@@ -630,7 +627,6 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
 define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f16_known_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
 ; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
@@ -639,6 +635,7 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
 ; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
 ; CHECK-NEXT:    v_exp_f16_e32 v1, v1
 ; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
@@ -648,7 +645,6 @@ define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
 define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f32_known_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x800000
 ; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
@@ -672,6 +668,7 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
 ; CHECK-NEXT:    s_brev_b32 s4, 1
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; CHECK-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
@@ -681,9 +678,9 @@ define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
 define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 ; CHECK-LABEL: test_pown_fast_f64_known_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 92b70ad797b88ae..d017c61497183d8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -7,7 +7,6 @@
 define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX908-LABEL: syncscope_system:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
@@ -21,6 +20,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX908-NEXT:    buffer_wbinvl1_vol
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_waitcnt expcnt(0)
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -30,7 +30,6 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ;
 ; GFX90A-LABEL: syncscope_system:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
@@ -46,6 +45,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -55,17 +55,16 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ;
 ; GFX940-LABEL: syncscope_system:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: syncscope_system:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:  .LBB0_1: ; %atomicrmw.start
@@ -82,7 +81,7 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX1100-NEXT:    buffer_gl1_inv
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
 ; GFX1100-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -96,7 +95,6 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
 define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ; GFX908-LABEL: syncscope_workgroup_rtn:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
@@ -109,6 +107,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_waitcnt expcnt(0)
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -118,10 +117,10 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ;
 ; GFX90A-LABEL: syncscope_workgroup_rtn:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr3
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execz .LBB1_6
@@ -168,18 +167,19 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 ;
 ; GFX940-LABEL: syncscope_workgroup_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: syncscope_workgroup_rtn:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 glc
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    buffer_gl0_inv
+; GFX1100-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret float %res
@@ -188,9 +188,9 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 {
 define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX908-LABEL: syncscope_workgroup_nortn:
 ; GFX908:       ; %bb.0: ; %atomicrmw.check.shared
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX908-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT:    s_waitcnt expcnt(0)
 ; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB2_3
@@ -199,7 +199,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX908-NEXT:    s_cbranch_execnz .LBB2_8
 ; GFX908-NEXT:  .LBB2_2: ; %atomicrmw.phi
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ; GFX908-NEXT:  .LBB2_3: ; %atomicrmw.check.private
 ; GFX908-NEXT:    s_mov_b64 s[6:7], src_private_base
@@ -240,9 +240,9 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ;
 ; GFX90A-LABEL: syncscope_workgroup_nortn:
 ; GFX90A:       ; %bb.0: ; %atomicrmw.check.shared
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB2_3
@@ -251,7 +251,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB2_8
 ; GFX90A-NEXT:  .LBB2_2: ; %atomicrmw.phi
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ; GFX90A-NEXT:  .LBB2_3: ; %atomicrmw.check.private
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], src_private_base
@@ -292,19 +292,20 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ;
 ; GFX940-LABEL: syncscope_workgroup_nortn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: syncscope_workgroup_nortn:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX1100-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1100-NEXT:    buffer_gl0_inv
+; GFX1100-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
   ret void
@@ -313,7 +314,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 define float @no_unsafe(ptr %addr, float %val) {
 ; GFX908-LABEL: no_unsafe:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB3_1: ; %atomicrmw.start
@@ -326,6 +326,7 @@ define float @no_unsafe(ptr %addr, float %val) {
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_waitcnt expcnt(0)
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -335,7 +336,6 @@ define float @no_unsafe(ptr %addr, float %val) {
 ;
 ; GFX90A-LABEL: no_unsafe:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB3_1: ; %atomicrmw.start
@@ -348,6 +348,7 @@ define float @no_unsafe(ptr %addr, float %val) {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -357,14 +358,13 @@ define float @no_unsafe(ptr %addr, float %val) {
 ;
 ; GFX940-LABEL: no_unsafe:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: no_unsafe:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    flat_load_b32 v3, v[0:1]
 ; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:  .LBB3_1: ; %atomicrmw.start
@@ -380,7 +380,7 @@ define float @no_unsafe(ptr %addr, float %val) {
 ; GFX1100-NEXT:    buffer_gl0_inv
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
 ; GFX1100-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1100-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
 ; GFX1100-NEXT:    s_cbranch_execnz .LBB3_1
 ; GFX1100-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
index 5e1f9b0a8df92b9..2674830dad7a752 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll
@@ -4,7 +4,6 @@
 define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_lds:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v1, v0
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB0_1: ; %atomicrmw.start
@@ -18,11 +17,13 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_1
 ; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw nand ptr addrspace(3) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -31,7 +32,6 @@ define i32 @atomic_nand_i32_lds(ptr addrspace(3) %ptr) nounwind {
 define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_global:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v2, v[0:1], off
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB1_1: ; %atomicrmw.start
@@ -46,6 +46,7 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
 ; GCN-NEXT:    buffer_wbinvl1_vol
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_1
 ; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -59,7 +60,6 @@ define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
 define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
 ; GCN-LABEL: atomic_nand_i32_flat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB2_1: ; %atomicrmw.start
@@ -74,6 +74,7 @@ define i32 @atomic_nand_i32_flat(ptr %ptr) nounwind {
 ; GCN-NEXT:    buffer_wbinvl1_vol
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB2_1
 ; GCN-NEXT:  ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index fc8a4c0ac0109b6..d729b1c58288c59 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -12,9 +12,8 @@
 define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 ; GFX9-NO-BACKOFF-LABEL: back_off_barrier_no_fence:
 ; GFX9-NO-BACKOFF:       ; %bb.0:
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NO-BACKOFF-NEXT:    s_barrier
 ; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -32,19 +31,18 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 ;
 ; GFX10-BACKOFF-LABEL: back_off_barrier_no_fence:
 ; GFX10-BACKOFF:       ; %bb.0:
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX10-BACKOFF-NEXT:    s_barrier
 ; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-BACKOFF-LABEL: back_off_barrier_no_fence:
 ; GFX11-BACKOFF:       ; %bb.0:
-; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-BACKOFF-NEXT:    s_barrier
 ; GFX11-BACKOFF-NEXT:    flat_store_b32 v[2:3], v0
 ; GFX11-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
@@ -58,9 +56,8 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 {
 ; GFX9-NO-BACKOFF-LABEL: back_off_barrier_with_fence:
 ; GFX9-NO-BACKOFF:       ; %bb.0:
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NO-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
-; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NO-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NO-BACKOFF-NEXT:    s_barrier
 ; GFX9-NO-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NO-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
@@ -80,7 +77,6 @@ define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 {
 ;
 ; GFX10-BACKOFF-LABEL: back_off_barrier_with_fence:
 ; GFX10-BACKOFF:       ; %bb.0:
-; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX10-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -89,14 +85,13 @@ define void @back_off_barrier_with_fence(ptr %in, ptr %out) #0 {
 ; GFX10-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-BACKOFF-NEXT:    buffer_gl0_inv
 ; GFX10-BACKOFF-NEXT:    flat_store_dword v[2:3], v0
-; GFX10-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-BACKOFF-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-BACKOFF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-BACKOFF-LABEL: back_off_barrier_with_fence:
 ; GFX11-BACKOFF:       ; %bb.0:
-; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-BACKOFF-NEXT:    s_barrier
 ; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0a81e98005b1adf..3f65d3b83da638e 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -12,7 +12,6 @@
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -20,12 +19,11 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -33,33 +31,31 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   store bfloat %val, ptr addrspace(1) %out
@@ -69,7 +65,6 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_f32_to_bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -78,12 +73,11 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_f32_to_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -92,34 +86,32 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_f32_to_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_f32_to_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_f32_to_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load float, ptr addrspace(1) %in
   %val.bf16 = fptrunc float %val to bfloat
@@ -130,7 +122,6 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_f64_to_bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -140,12 +131,11 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_f64_to_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -155,37 +145,35 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX7-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_f64_to_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_f64_to_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    global_store_short_d16_hi v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_f64_to_bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX10-NEXT:    global_store_short_d16_hi v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load double, ptr addrspace(1) %in
   %val.bf16 = fptrunc double %val to bfloat
@@ -196,7 +184,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
 define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_bf16_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -205,12 +192,11 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_bf16_to_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -219,36 +205,34 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_bf16_to_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_dword v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_bf16_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[2:3], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_bf16_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dword v[2:3], v4, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val.f32 = fpext bfloat %val to float
@@ -259,7 +243,6 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
 define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_bf16_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -269,12 +252,11 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_bf16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -284,39 +266,37 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_bf16_to_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_bf16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_short_d16_hi v4, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_bf16_to_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    global_load_short_d16_hi v4, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val.f64 = fpext bfloat %val to double
@@ -327,7 +307,6 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
 define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -335,12 +314,11 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GCN-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_v2bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -348,33 +326,31 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_v2bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_dword v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_v2bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_v2bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x bfloat>, ptr addrspace(1) %in
   store <2 x bfloat> %val, ptr addrspace(1) %out
@@ -384,7 +360,6 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -392,12 +367,11 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GCN-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_v4bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -405,33 +379,31 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_v4bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_v4bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_v4bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x bfloat>, ptr addrspace(1) %in
   store <4 x bfloat> %val, ptr addrspace(1) %out
@@ -441,7 +413,6 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_v8bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -449,12 +420,11 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_v8bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -462,33 +432,31 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 ; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_v8bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_v8bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_v8bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x bfloat>, ptr addrspace(1) %in
   store <8 x bfloat> %val, ptr addrspace(1) %out
@@ -498,7 +466,6 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out)
 define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store_v16bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -509,12 +476,11 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 ; GCN-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_load_store_v16bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -525,12 +491,11 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 ; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
 ; GFX7-NEXT:    buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_load_store_v16bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -541,24 +506,22 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_load_store_v16bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_load_store_v16bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off offset:16
 ; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
@@ -566,6 +529,7 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 ; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[2:3], v[8:11], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load <16 x bfloat>, ptr addrspace(1) %in
   store <16 x bfloat> %val, ptr addrspace(1) %out
@@ -575,47 +539,43 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out
 define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    flat_store_short v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short_d16_hi v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_short_d16_hi v[1:2], v0, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %in, ptr addrspace(1) %out
   ret void
@@ -624,7 +584,6 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
 define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store_v2bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
@@ -632,12 +591,11 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store_v2bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
@@ -645,27 +603,25 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store_v2bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dword v[1:2], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store_v2bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[1:2], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store_v2bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v[1:2], v0, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <2 x bfloat> %in, ptr addrspace(1) %out
   ret void
@@ -674,7 +630,6 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
 define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
 ; GCN-LABEL: test_arg_store_v3bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
@@ -684,12 +639,11 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:    buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
 ; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store_v3bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
@@ -699,32 +653,30 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
 ; GFX7-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store_v3bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dword v[2:3], v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_store_short v[2:3], v1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store_v3bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v[2:3], v1, off offset:4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store_v3bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_short v[2:3], v1, off offset:4
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
   ret void
@@ -733,7 +685,6 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store_v4bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GCN-NEXT:    s_mov_b32 s6, 0
@@ -743,12 +694,11 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    s_mov_b32 s4, s6
 ; GCN-NEXT:    s_mov_b32 s5, s6
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store_v4bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -758,27 +708,25 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store_v4bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store_v4bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store_v4bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <4 x bfloat> %in, ptr addrspace(1)  %out
   ret void
@@ -787,7 +735,6 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
 define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store_v8bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
@@ -801,12 +748,11 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    v_alignbit_b32 v3, v3, v2, 16
 ; GCN-NEXT:    v_alignbit_b32 v2, v1, v0, 16
 ; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store_v8bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
@@ -820,27 +766,25 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    v_alignbit_b32 v4, v3, v2, 16
 ; GFX7-NEXT:    v_alignbit_b32 v3, v1, v0, 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store_v8bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store_v8bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store_v8bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <8 x bfloat> %in, ptr addrspace(1) %out
   ret void
@@ -849,7 +793,6 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) {
 define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store_v16bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
@@ -872,12 +815,11 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-NEXT:    v_alignbit_b32 v10, v9, v8, 16
 ; GCN-NEXT:    buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
 ; GCN-NEXT:    buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_arg_store_v16bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -900,33 +842,31 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 ; GFX7-NEXT:    v_alignbit_b32 v6, v7, v6, 16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
 ; GFX7-NEXT:    buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_arg_store_v16bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v8
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v9, vcc
 ; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_arg_store_v16bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_arg_store_v16bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store <16 x bfloat> %in, ptr addrspace(1) %out
   ret void
@@ -935,7 +875,6 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) {
 define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_inreg_arg_store:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s34, s4, 16
 ; GCN-NEXT:    s_mov_b32 s38, 0
 ; GCN-NEXT:    s_mov_b32 s39, 0xf000
@@ -943,12 +882,11 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GCN-NEXT:    s_mov_b32 s37, s38
 ; GCN-NEXT:    v_mov_b32_e32 v2, s34
 ; GCN-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_inreg_arg_store:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_lshr_b32 s34, s4, 16
 ; GFX7-NEXT:    s_mov_b32 s38, 0
 ; GFX7-NEXT:    s_mov_b32 s39, 0xf000
@@ -956,31 +894,29 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 ; GFX7-NEXT:    s_mov_b32 s37, s38
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[36:39], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_inreg_arg_store:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s34, s4, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s34
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_inreg_arg_store:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_inreg_arg_store:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    global_store_short_d16_hi v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %in, ptr addrspace(1) %out
   ret void
@@ -989,39 +925,35 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
 define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
 ; GCN-LABEL: test_byval:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GCN-NEXT:    buffer_store_short v1, off, s[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_byval:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX7-NEXT:    buffer_store_short v1, off, s[0:3], s32
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_byval:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    buffer_store_short v1, off, s[0:3], s32
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_byval:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_byval:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %val, ptr addrspace(5) %bv
   %retval = load bfloat, ptr addrspace(5) %bv
@@ -1031,39 +963,35 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
 define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
 ; GCN-LABEL: test_sret:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_sret:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_sret:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_sret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_sret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %val, ptr addrspace(5) %sret
   ret void
@@ -1072,7 +1000,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
 define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_bitcast_from_bfloat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -1080,12 +1007,11 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 ; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_bitcast_from_bfloat:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -1093,33 +1019,31 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 ; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_bitcast_from_bfloat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_short v[2:3], v0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_bitcast_from_bfloat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_bitcast_from_bfloat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) %in
   %val_int = bitcast bfloat %val to i16
@@ -1130,7 +1054,6 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
 define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GCN-LABEL: test_bitcast_to_bfloat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s4, s6
@@ -1138,12 +1061,11 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_bitcast_to_bfloat:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -1151,33 +1073,31 @@ define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX7-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_bitcast_to_bfloat:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_bitcast_to_bfloat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short v[0:1], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_bitcast_to_bfloat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v2, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to bfloat
@@ -1256,20 +1176,20 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) {
 ;
 ; GFX8-LABEL: test_ret_v3bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ret_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_ret_v3bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <3 x bfloat> %in
@@ -1365,9 +1285,9 @@ entry:
 define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1396,9 +1316,9 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1426,9 +1346,9 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1456,9 +1376,9 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1485,9 +1405,9 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1522,9 +1442,9 @@ entry:
 define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v2bf16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1557,9 +1477,9 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call_v2bf16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1591,9 +1511,9 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call_v2bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1620,9 +1540,9 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call_v2bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1649,9 +1569,9 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call_v2bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1686,9 +1606,9 @@ entry:
 define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v3bf16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1722,9 +1642,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call_v3bf16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1757,9 +1677,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call_v3bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1790,9 +1710,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1822,9 +1742,9 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call_v3bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1862,9 +1782,9 @@ entry:
 define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v4bf16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1905,9 +1825,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call_v4bf16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1947,9 +1867,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call_v4bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1987,9 +1907,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call_v4bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2022,9 +1942,9 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call_v4bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2065,9 +1985,9 @@ entry:
 define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v8bf16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2124,9 +2044,9 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call_v8bf16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2182,9 +2102,9 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call_v8bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2236,9 +2156,9 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call_v8bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2279,9 +2199,9 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call_v8bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2330,9 +2250,9 @@ entry:
 define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ; GCN-LABEL: test_call_v16bf16:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2421,9 +2341,9 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX7-LABEL: test_call_v16bf16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s33
 ; GFX7-NEXT:    s_mov_b32 s33, s32
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX7-NEXT:    buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX7-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2511,9 +2431,9 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX8-LABEL: test_call_v16bf16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, s33
 ; GFX8-NEXT:    s_mov_b32 s33, s32
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX8-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2593,9 +2513,9 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX9-LABEL: test_call_v16bf16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -2652,9 +2572,9 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) {
 ;
 ; GFX10-LABEL: test_call_v16bf16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2719,56 +2639,55 @@ entry:
 define bfloat @test_alloca_load_store_ret(bfloat %in) {
 ; GCN-LABEL: test_alloca_load_store_ret:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_alloca_load_store_ret:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_alloca_load_store_ret:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], s32
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_alloca_load_store_ret:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_alloca_load_store_ret:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    buffer_store_short_d16_hi v0, off, s[0:3], s32
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %in.addr = alloca bfloat, align 2, addrspace(5)
@@ -2780,7 +2699,6 @@ entry:
 define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GCN-LABEL: test_overflow_stack:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
@@ -2868,12 +2786,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GCN-NEXT:    buffer_store_dword v4, v18, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v3, v25, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: test_overflow_stack:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GFX7-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_add_i32_e32 v31, vcc, 0x7c, v0
@@ -2946,12 +2863,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
 ; GFX7-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GFX7-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_overflow_stack:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8
 ; GFX8-NEXT:    v_add_u32_e32 v31, vcc, 0x7c, v0
@@ -3024,12 +2940,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
 ; GFX8-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_overflow_stack:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:112
 ; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:108
 ; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:104
@@ -3071,12 +2986,11 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(18)
 ; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:116
 ; GFX9-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_overflow_stack:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:8
 ; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
@@ -3117,6 +3031,7 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:116
 ; GFX10-NEXT:    buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0
   %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 6ac3511ba50206b..075b0057d840328 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -92,32 +92,32 @@ entry:
 define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
 ; GFX7-LABEL: v_bfi_def_i32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bfi_def_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bfi_def_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bfi_def_i32:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bfi_def_i32:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = xor i32 %x, -1
@@ -212,32 +212,32 @@ entry:
 define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
 ; GFX7-LABEL: v_bfi_sha256_ch:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bfi_sha256_ch:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bfi_sha256_ch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = xor i32 %y, %z
@@ -544,37 +544,37 @@ entry:
 define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
 ; GFX7-LABEL: v_bfi_sha256_ma:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bfi_sha256_ma:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bfi_sha256_ma:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = and i32 %x, %z
@@ -587,37 +587,37 @@ entry:
 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
 ; GFX7-LABEL: v_bitselect_v2i32_pat1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bitselect_v2i32_pat1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bitselect_v2i32_pat1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %xor.0 = xor <2 x i32> %a, %mask
   %and = and <2 x i32> %xor.0, %b
@@ -628,37 +628,37 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %
 define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-LABEL: v_bitselect_i64_pat_0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v1, v1, v3, v5
 ; GFX7-NEXT:    v_bfi_b32 v0, v0, v2, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bitselect_i64_pat_0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v1, v1, v3, v5
 ; GFX8-NEXT:    v_bfi_b32 v0, v0, v2, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bitselect_i64_pat_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_bfi_b32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v4
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v2, v4
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v3, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and0 = and i64 %a, %b
   %not.a = xor i64 %a, -1
@@ -930,37 +930,37 @@ define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i6
 define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-LABEL: v_bitselect_i64_pat_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
 ; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bitselect_i64_pat_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
 ; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bitselect_i64_pat_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %xor.0 = xor i64 %a, %mask
   %and = and i64 %xor.0, %b
@@ -1104,37 +1104,37 @@ define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i6
 define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-LABEL: v_bitselect_i64_pat_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfi_b32 v1, v3, v1, v5
 ; GFX7-NEXT:    v_bfi_b32 v0, v2, v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bitselect_i64_pat_2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_bfi_b32 v1, v3, v1, v5
 ; GFX8-NEXT:    v_bfi_b32 v0, v2, v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bitselect_i64_pat_2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v2, v0, v4
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v3, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %xor.0 = xor i64 %a, %mask
   %and = and i64 %xor.0, %b
@@ -1145,47 +1145,47 @@ define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
 define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
 ; GFX7-LABEL: v_bfi_sha256_ma_i64:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_bfi_b32 v1, v1, v5, v3
 ; GFX7-NEXT:    v_bfi_b32 v0, v0, v4, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_bfi_sha256_ma_i64:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_bfi_b32 v1, v1, v5, v3
 ; GFX8-NEXT:    v_bfi_b32 v0, v0, v4, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_bfi_sha256_ma_i64:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_bfi_b32 v0, v0, v4, v2
 ; GFX10-NEXT:    v_bfi_b32 v1, v1, v5, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v0, v0, v4, v2
 ; GFX8-GISEL-NEXT:    v_bfi_b32 v1, v1, v5, v3
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v0, v0, v4, v2
 ; GFX10-GISEL-NEXT:    v_bfi_b32 v1, v1, v5, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %and0 = and i64 %x, %z
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
index 3383b46e7dd8f7d..36e508d1dd51d64 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_nested.ll
@@ -4,7 +4,6 @@
 define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_single_nesting_level:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -17,6 +16,7 @@ define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -38,7 +38,6 @@ define float @v_bfi_single_nesting_level(float %x, float %y, float %z) {
 define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_single_nesting_level_swapped_operands:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -51,6 +50,7 @@ define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, fl
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -72,7 +72,6 @@ define float @v_bfi_single_nesting_level_swapped_operands(float %x, float %y, fl
 define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_single_nesting_level_unbalanced_subtree:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -87,6 +86,7 @@ define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y,
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xc000001f, v2
 ; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -110,7 +110,6 @@ define float @v_bfi_single_nesting_level_unbalanced_subtree(float %x, float %y,
 define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_single_nesting_level_inner_use:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -119,6 +118,7 @@ define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x400003ff, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -141,7 +141,6 @@ define float @v_bfi_single_nesting_level_inner_use(float %x, float %y, float %z)
 define float @v_bfi_no_nesting(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_no_nesting:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -154,6 +153,7 @@ define float @v_bfi_no_nesting(float %x, float %y, float %z) {
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
@@ -175,7 +175,6 @@ define float @v_bfi_no_nesting(float %x, float %y, float %z) {
 define float @v_bfi_two_levels(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_two_levels:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -190,6 +189,7 @@ define float @v_bfi_two_levels(float %x, float %y, float %z) {
 ; GCN-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32
@@ -213,7 +213,6 @@ define float @v_bfi_two_levels(float %x, float %y, float %z) {
 define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_two_levels_inner_or_multiple_uses:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -229,6 +228,7 @@ define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff00000, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %y.i32 = fptoui float %y to i32
@@ -254,7 +254,6 @@ define float @v_bfi_two_levels_inner_or_multiple_uses(float %x, float %y, float
 define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
 ; GCN-LABEL: v_bfi_single_constant_as_partition:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x447fc000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x447fc000, v0
@@ -264,6 +263,7 @@ define float @v_bfi_single_constant_as_partition(float %x, float %y, float %z) {
 ; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul.base = fmul reassoc nnan nsz arcp contract afn float %z, 1.023000e+03
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 789a64220e62976..496b1f4d3bf6c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -66,22 +66,20 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #
 define void @v_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
 ; SI-LABEL: v_bfm_pattern:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_bfm_b32_e32 v2, v2, v3
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bfm_pattern:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfm_b32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
@@ -93,22 +91,20 @@ define void @v_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) #0 {
 define void @v_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #0 {
 ; SI-LABEL: v_bfm_pattern_simple:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_bfm_b32_e64 v2, v2, 0
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bfm_pattern_simple:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfm_b32 v2, v2, 0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 64555f14a55cc1c..1ce6337948b75a8 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -875,42 +875,42 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs
 define float @missing_truncate_promote_bitreverse(i32 %arg) {
 ; SI-LABEL: missing_truncate_promote_bitreverse:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_bfrev_b32_e32 v0, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLAT-LABEL: missing_truncate_promote_bitreverse:
 ; FLAT:       ; %bb.0: ; %bb
-; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLAT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: missing_truncate_promote_bitreverse:
 ; GISEL:       ; %bb.0: ; %bb
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
 ; GFX11-FLAT:       ; %bb.0: ; %bb
-; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FLAT-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLAT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = trunc i32 %arg to i16
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 6201d7341898f58..ea8531fca717277 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -897,7 +897,7 @@ bb3:
 define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-LABEL: spill_func:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -1705,7 +1705,7 @@ define void @spill_func(ptr addrspace(1) %arg) #0 {
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cnd = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index e4c7df385d86197..0202bfb6f703fc9 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -487,29 +487,29 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace
 define float @missing_truncate_promote_bswap(i32 %arg) {
 ; SI-LABEL: missing_truncate_promote_bswap:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: missing_truncate_promote_bswap:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: missing_truncate_promote_bswap:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = trunc i32 %arg to i16
@@ -522,25 +522,25 @@ bb:
 define i16 @v_bswap_i16(i16 %src) {
 ; SI-LABEL: v_bswap_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   ret i16 %bswap
@@ -549,25 +549,25 @@ define i16 @v_bswap_i16(i16 %src) {
 define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 ; SI-LABEL: v_bswap_i16_zext_to_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_i16_zext_to_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_i16_zext_to_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   %zext = zext i16 %bswap to i32
@@ -577,28 +577,28 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
 define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 ; SI-LABEL: v_bswap_i16_sext_to_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v1, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_i16_sext_to_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0xc0c0001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_i16_sext_to_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0xc0c0001
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call i16 @llvm.bswap.i16(i16 %src)
   %zext = sext i16 %bswap to i32
@@ -608,7 +608,6 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
 define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; SI-LABEL: v_bswap_v2i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -618,19 +617,20 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 ; SI-NEXT:    v_bfi_b32 v1, s4, v1, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x2030001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
   ret <2 x i16> %bswap
@@ -639,7 +639,6 @@ define <2 x i16> @v_bswap_v2i16(<2 x i16> %src) {
 define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 ; SI-LABEL: v_bswap_v3i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v3, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -654,21 +653,22 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 ; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v3i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x2030001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
 ; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <3 x i16> @llvm.bswap.v3i16(<3 x i16> %src)
   ret <3 x i16> %bswap
@@ -677,7 +677,6 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
 define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 ; SI-LABEL: v_bswap_v4i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
 ; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -697,21 +696,22 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 ; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x2030001
 ; VI-NEXT:    v_perm_b32 v0, 0, v0, s4
 ; VI-NEXT:    v_perm_b32 v1, 0, v1, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v0, 0x2030001
 ; GFX11-NEXT:    v_perm_b32 v1, 0, v1, 0x2030001
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bswap = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %src)
   ret <4 x i16> %bswap
@@ -720,7 +720,6 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
 define i64 @v_bswap_i48(i64 %src) {
 ; SI-LABEL: v_bswap_i48:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    s_mov_b32 s4, 0xff00ff
@@ -730,26 +729,27 @@ define i64 @v_bswap_i48(i64 %src) {
 ; SI-NEXT:    v_bfi_b32 v0, s4, v1, v3
 ; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_bswap_i48:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x10203
 ; VI-NEXT:    v_perm_b32 v2, 0, v0, s4
 ; VI-NEXT:    v_perm_b32 v0, 0, v1, s4
 ; VI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_bswap_i48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v2, 0, v0, 0x10203
 ; GFX11-NEXT:    v_perm_b32 v0, 0, v1, 0x10203
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i64 %src to i48
   %bswap = call i48 @llvm.bswap.i48(i48 %trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 46b2f82d9de2a3f..b5bc4b64a680315 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -5,20 +5,20 @@
 define void @undef_lo_v2i16(i16 %arg0) {
 ; GFX9-LABEL: undef_lo_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x i16> undef, i16 %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.lo);
@@ -28,20 +28,20 @@ define void @undef_lo_v2i16(i16 %arg0) {
 define void @undef_lo_v2f16(half %arg0) {
 ; GFX9-LABEL: undef_lo_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x half> undef, half %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<2 x half> %undef.lo);
@@ -51,23 +51,23 @@ define void @undef_lo_v2f16(half %arg0) {
 define void @undef_lo_op_v2f16(half %arg0) {
 ; GFX9-LABEL: undef_lo_op_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo_op_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3c00
 ; GFX8-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, 0x7e00, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x half> undef, half %arg0, i32 1
   %op = fadd <2 x half> %undef.lo, <half 1.0, half 1.0>
@@ -78,23 +78,23 @@ define void @undef_lo_op_v2f16(half %arg0) {
 define void @undef_lo_op_v2i16(i16 %arg0) {
 ; GFX9-LABEL: undef_lo_op_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_movk_i32 s4, 0x63
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo_op_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <2 x i16> undef, i16 %arg0, i32 1
   %op = add <2 x i16> %undef.lo, <i16 99, i16 99>
@@ -105,20 +105,20 @@ define void @undef_lo_op_v2i16(i16 %arg0) {
 define void @undef_lo3_v4i16(i16 %arg0) {
 ; GFX9-LABEL: undef_lo3_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo3_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <4 x i16> undef, i16 %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
@@ -128,20 +128,20 @@ define void @undef_lo3_v4i16(i16 %arg0) {
 define void @undef_lo3_v4f16(half %arg0) {
 ; GFX9-LABEL: undef_lo3_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo3_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = insertelement <4 x half> undef, half %arg0, i32 1
   call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo);
@@ -151,22 +151,22 @@ define void @undef_lo3_v4f16(half %arg0) {
 define void @undef_lo2_v4i16(<2 x i16> %arg0) {
 ; GFX9-LABEL: undef_lo2_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo2_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = shufflevector <2 x i16> %arg0, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
   call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo);
@@ -176,22 +176,22 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) {
 define void @undef_lo2_v4f16(<2 x half> %arg0) {
 ; GFX9-LABEL: undef_lo2_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_lo2_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.lo = shufflevector <2 x half> %arg0, <2 x half> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
   call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo);
@@ -201,18 +201,18 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) {
 define void @undef_hi_v2i16(i16 %arg0) {
 ; GFX9-LABEL: undef_hi_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <2 x i16> undef, i16 %arg0, i32 0
   call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.hi);
@@ -222,18 +222,18 @@ define void @undef_hi_v2i16(i16 %arg0) {
 define void @undef_hi_v2f16(half %arg0) {
 ; GFX9-LABEL: undef_hi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <2 x half> undef, half %arg0, i32 0
   call void asm sideeffect "; use $0", "v"(<2 x half> %undef.hi);
@@ -243,21 +243,21 @@ define void @undef_hi_v2f16(half %arg0) {
 define void @undef_hi_op_v2f16(half %arg0) {
 ; GFX9-LABEL: undef_hi_op_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi_op_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <2 x half> undef, half %arg0, i32 0
   %op = fadd <2 x half> %undef.hi, <half 1.0, half 1.0>
@@ -268,21 +268,21 @@ define void @undef_hi_op_v2f16(half %arg0) {
 define void @undef_hi_op_v2i16(i16 %arg0) {
 ; GFX9-LABEL: undef_hi_op_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x63
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v0
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi_op_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v0, 0x63, v0
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <2 x i16> undef, i16 %arg0, i32 0
   %op = add <2 x i16> %undef.hi, <i16 99, i16 99>
@@ -293,18 +293,18 @@ define void @undef_hi_op_v2i16(i16 %arg0) {
 define void @undef_hi3_v4i16(i16 %arg0) {
 ; GFX9-LABEL: undef_hi3_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi3_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <4 x i16> undef, i16 %arg0, i32 0
   call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.hi);
@@ -314,18 +314,18 @@ define void @undef_hi3_v4i16(i16 %arg0) {
 define void @undef_hi3_v4f16(half %arg0) {
 ; GFX9-LABEL: undef_hi3_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi3_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = insertelement <4 x half> undef, half %arg0, i32 0
   call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi);
@@ -335,18 +335,18 @@ define void @undef_hi3_v4f16(half %arg0) {
 define void @undef_hi2_v4i16(<2 x i16> %arg0) {
 ; GFX9-LABEL: undef_hi2_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi2_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = shufflevector <2 x i16> %arg0, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.hi);
@@ -356,18 +356,18 @@ define void @undef_hi2_v4i16(<2 x i16> %arg0) {
 define void @undef_hi2_v4f16(<2 x half> %arg0) {
 ; GFX9-LABEL: undef_hi2_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    ; use v[0:1]
 ; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: undef_hi2_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v[0:1]
 ; GFX8-NEXT:    ;;#ASMEND
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %undef.hi = shufflevector <2 x half> %arg0, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi);
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index cb1b664549c9a30..ce20437032cfbde 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -7,11 +7,11 @@
 define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-LABEL: sdiv64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_2
@@ -150,6 +150,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   ret i64 %d
@@ -158,11 +159,11 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-LABEL: udiv64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_2
@@ -286,6 +287,7 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i64 %a, %b
   ret i64 %d
@@ -294,11 +296,11 @@ define i64 @udiv64(i64 %a, i64 %b) {
 define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: srem64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_2
@@ -433,6 +435,7 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i64 %a, %b
   ret i64 %d
@@ -441,11 +444,11 @@ define i64 @srem64(i64 %a, i64 %b) {
 define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: urem64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_2
@@ -566,6 +569,7 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = urem i64 %a, %b
   ret i64 %d
@@ -574,7 +578,6 @@ define i64 @urem64(i64 %a, i64 %b) {
 define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-LABEL: sdiv32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -603,6 +606,7 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i32 %a, %b
   ret i32 %d
@@ -611,7 +615,6 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 define i32 @udiv32(i32 %a, i32 %b) {
 ; GFX9-LABEL: udiv32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -631,6 +634,7 @@ define i32 @udiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v2
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i32 %a, %b
   ret i32 %d
@@ -639,7 +643,6 @@ define i32 @udiv32(i32 %a, i32 %b) {
 define i32 @srem32(i32 %a, i32 %b) {
 ; GFX9-LABEL: srem32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -665,6 +668,7 @@ define i32 @srem32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i32 %a, %b
   ret i32 %d
@@ -673,7 +677,6 @@ define i32 @srem32(i32 %a, i32 %b) {
 define i32 @urem32(i32 %a, i32 %b) {
 ; GFX9-LABEL: urem32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
@@ -691,6 +694,7 @@ define i32 @urem32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v0, v1
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = urem i32 %a, %b
   ret i32 %d
@@ -699,12 +703,12 @@ define i32 @urem32(i32 %a, i32 %b) {
 define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: sdivrem64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[10:11], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB8_2
@@ -859,6 +863,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   %r = srem i64 %a, %b
@@ -870,12 +875,12 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-LABEL: udivrem64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v5, v1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
 ; GFX9-NEXT:    ; implicit-def: $vgpr6_vgpr7
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_2
@@ -1011,6 +1016,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i64 %a, %b
   %r = urem i64 %a, %b
@@ -1022,7 +1028,6 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 define i64 @sdiv64_known32(i64 %a, i64 %b) {
 ; GFX9-LABEL: sdiv64_known32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
@@ -1034,6 +1039,7 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.ext = ashr i64 %a, 32
   %b.ext = ashr i64 %b, 32
@@ -1044,7 +1050,6 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) {
 define i64 @udiv64_known32(i64 %a, i64 %b) {
 ; GFX9-LABEL: udiv64_known32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, v2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v1
@@ -1055,6 +1060,7 @@ define i64 @udiv64_known32(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %a.mask = and i64 %a, 4294967295
   %b.mask = and i64 %b, 4294967295
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..275a3718ef936bc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5642,7 +5642,6 @@ entry:
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; VI-LABEL: tail_call_byval_align16:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; VI-NEXT:    s_getpc_b64 s[4:5]
@@ -5659,7 +5658,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; CI-LABEL: tail_call_byval_align16:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; CI-NEXT:    s_getpc_b64 s[4:5]
@@ -5676,7 +5674,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; GFX9-LABEL: tail_call_byval_align16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
@@ -5693,7 +5690,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; GFX11-LABEL: tail_call_byval_align16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg at rel32@lo+4
@@ -5707,7 +5703,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; HSA-LABEL: tail_call_byval_align16:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
 ; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32
 ; HSA-NEXT:    s_getpc_b64 s[4:5]
@@ -5730,7 +5725,6 @@ entry:
 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
 ; VI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
@@ -5747,7 +5741,6 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
 ;
 ; CI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
@@ -5764,7 +5757,6 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
 ;
 ; GFX9-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
@@ -5781,7 +5773,6 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
 ;
 ; GFX11-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32
 ; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:4
@@ -5796,7 +5787,6 @@ define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, do
 ;
 ; HSA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; HSA-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
@@ -5818,9 +5808,9 @@ entry:
 define void @stack_12xv3i32() #0 {
 ; VI-LABEL: stack_12xv3i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s33
 ; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -5886,9 +5876,9 @@ define void @stack_12xv3i32() #0 {
 ;
 ; CI-LABEL: stack_12xv3i32:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s4, s33
 ; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -5954,9 +5944,9 @@ define void @stack_12xv3i32() #0 {
 ;
 ; GFX9-LABEL: stack_12xv3i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6022,9 +6012,9 @@ define void @stack_12xv3i32() #0 {
 ;
 ; GFX11-LABEL: stack_12xv3i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6072,9 +6062,9 @@ define void @stack_12xv3i32() #0 {
 ;
 ; HSA-LABEL: stack_12xv3i32:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    s_mov_b32 s4, s33
 ; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_waitcnt expcnt(0)
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6157,9 +6147,9 @@ entry:
 define void @stack_12xv3f32() #0 {
 ; VI-LABEL: stack_12xv3f32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s33
 ; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6225,9 +6215,9 @@ define void @stack_12xv3f32() #0 {
 ;
 ; CI-LABEL: stack_12xv3f32:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s4, s33
 ; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6293,9 +6283,9 @@ define void @stack_12xv3f32() #0 {
 ;
 ; GFX9-LABEL: stack_12xv3f32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6361,9 +6351,9 @@ define void @stack_12xv3f32() #0 {
 ;
 ; GFX11-LABEL: stack_12xv3f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6415,9 +6405,9 @@ define void @stack_12xv3f32() #0 {
 ;
 ; HSA-LABEL: stack_12xv3f32:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    s_mov_b32 s4, s33
 ; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_waitcnt expcnt(0)
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6500,9 +6490,9 @@ entry:
 define void @stack_8xv5i32() #0 {
 ; VI-LABEL: stack_8xv5i32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s33
 ; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6576,9 +6566,9 @@ define void @stack_8xv5i32() #0 {
 ;
 ; CI-LABEL: stack_8xv5i32:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s4, s33
 ; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6652,9 +6642,9 @@ define void @stack_8xv5i32() #0 {
 ;
 ; GFX9-LABEL: stack_8xv5i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6728,9 +6718,9 @@ define void @stack_8xv5i32() #0 {
 ;
 ; GFX11-LABEL: stack_8xv5i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6783,9 +6773,9 @@ define void @stack_8xv5i32() #0 {
 ;
 ; HSA-LABEL: stack_8xv5i32:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    s_mov_b32 s4, s33
 ; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_waitcnt expcnt(0)
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6872,9 +6862,9 @@ entry:
 define void @stack_8xv5f32() #0 {
 ; VI-LABEL: stack_8xv5f32:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s33
 ; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; VI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -6948,9 +6938,9 @@ define void @stack_8xv5f32() #0 {
 ;
 ; CI-LABEL: stack_8xv5f32:
 ; CI:       ; %bb.0: ; %entry
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s4, s33
 ; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CI-NEXT:    s_mov_b64 exec, s[8:9]
@@ -7024,9 +7014,9 @@ define void @stack_8xv5f32() #0 {
 ;
 ; GFX9-LABEL: stack_8xv5f32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
@@ -7100,9 +7090,9 @@ define void @stack_8xv5f32() #0 {
 ;
 ; GFX11-LABEL: stack_8xv5f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7158,9 +7148,9 @@ define void @stack_8xv5f32() #0 {
 ;
 ; HSA-LABEL: stack_8xv5f32:
 ; HSA:       ; %bb.0: ; %entry
-; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; HSA-NEXT:    s_mov_b32 s4, s33
 ; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_waitcnt expcnt(0)
 ; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; HSA-NEXT:    s_mov_b64 exec, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index c0ba343d1dd358e..45c5ffc245bdb0a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -123,7 +123,6 @@ define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
 define void @tailcall_got_load(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-LABEL: tailcall_got_load:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, got.func at gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, got.func at gotpcrel32@hi+12
@@ -139,7 +138,6 @@ define void @tailcall_got_load(ptr addrspace(1) %ptr, i32) #0 {
 define void @tail_call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
 ; GCN-LABEL: tail_call_memory_arg_load:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ds_read_b32 v0, v0
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func at rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
index a28ddc0779b068e..fafab13e5f2bd62 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir
@@ -25,7 +25,6 @@ body:             |
     ; GCN-LABEL: name: call_waw_waitcnt
     ; GCN: liveins: $sgpr4_sgpr5, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $sgpr30_sgpr31 = S_LOAD_DWORDX2_IMM $sgpr4_sgpr5, 0, 0
     ; GCN-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr7
     ; GCN-NEXT: $flat_scr_lo = S_ADD_U32 killed $sgpr4, $sgpr33, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index cb89841b58f9787..c0a39dd7400a1cd 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -94,8 +94,8 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
 define fastcc float @fastcc(float %arg0) #0 {
 ; GCN-LABEL: fastcc:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %arg0, 4.0
   ret float %add
@@ -104,8 +104,8 @@ define fastcc float @fastcc(float %arg0) #0 {
 define coldcc float @coldcc(float %arg0) #0 {
 ; GCN-LABEL: coldcc:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
  %add = fadd float %arg0, 4.0
  ret float %add
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 3005b17e0524c6a..9a070b07fab4566 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -72,9 +72,9 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
 ;
 ; GFX7-LABEL: test_sinkable_flat_small_offset_i32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -92,9 +92,9 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
 ;
 ; GFX8-LABEL: test_sinkable_flat_small_offset_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -112,9 +112,9 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
 ;
 ; GFX9-LABEL: test_sinkable_flat_small_offset_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -130,9 +130,9 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) {
 ;
 ; GFX10-LABEL: test_sinkable_flat_small_offset_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
@@ -227,10 +227,10 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in,
 ;
 ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -249,9 +249,9 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in,
 ;
 ; GFX8-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -269,9 +269,9 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in,
 ;
 ; GFX9-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -287,9 +287,9 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in,
 ;
 ; GFX10-LABEL: test_sink_noop_addrspacecast_flat_to_global_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
@@ -340,10 +340,10 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in
 ;
 ; GFX7-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -362,9 +362,9 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in
 ;
 ; GFX8-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -382,9 +382,9 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in
 ;
 ; GFX9-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -400,9 +400,9 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in
 ;
 ; GFX10-LABEL: test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
@@ -508,10 +508,10 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX7-LABEL: test_sink_flat_small_max_flat_offset:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, -1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -529,10 +529,10 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX8-LABEL: test_sink_flat_small_max_flat_offset:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v5, -1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -550,10 +550,10 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX9-LABEL: test_sink_flat_small_max_flat_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v5, -1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -569,10 +569,10 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX10-LABEL: test_sink_flat_small_max_flat_offset:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v4, -1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
@@ -629,10 +629,10 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX7-LABEL: test_sink_flat_small_max_plus_1_flat_offset:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v5, -1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -650,10 +650,10 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX8-LABEL: test_sink_flat_small_max_plus_1_flat_offset:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v5, -1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -671,10 +671,10 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX9-LABEL: test_sink_flat_small_max_plus_1_flat_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v5, -1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -692,10 +692,10 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ;
 ; GFX10-LABEL: test_sink_flat_small_max_plus_1_flat_offset:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v4, -1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
@@ -752,10 +752,10 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ;
 ; GFX7-LABEL: test_sinkable_flat_reg_offset:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v7, -1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX7-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
@@ -773,10 +773,10 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ;
 ; GFX8-LABEL: test_sinkable_flat_reg_offset:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v7, -1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
@@ -794,10 +794,10 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ;
 ; GFX9-LABEL: test_sinkable_flat_reg_offset:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v7, -1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
@@ -815,10 +815,10 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ;
 ; GFX10-LABEL: test_sinkable_flat_reg_offset:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v6, -1, 0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..93e38c2d637e564 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -9,51 +9,46 @@
 define <2 x half> @chain_hi_to_lo_private() {
 ; GFX900-LABEL: chain_hi_to_lo_private:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_private:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s0, 2
 ; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private:
 ; GFX10_DEFAULT:       ; %bb.0: ; %bb
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_clause 0x1
 ; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:2
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 2
 ; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; FLATSCR_GFX10-NEXT:    s_mov_b32 s0, 0
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, off, s0
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 2
 ; GFX11-NEXT:    scratch_load_u16 v0, off, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, off, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr addrspace(5) null, i64 1
@@ -69,45 +64,40 @@ bb:
 define <2 x half> @chain_hi_to_lo_private_different_bases(ptr addrspace(5) %base_lo, ptr addrspace(5) %base_hi) {
 ; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_ushort v0, v0, off
 ; FLATSCR-NEXT:    s_nop 0
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v0, v1, off
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases:
 ; GFX10_DEFAULT:       ; %bb.0: ; %bb
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_clause 0x1
 ; GFX10_DEFAULT-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, v0, off
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v0, v1, off
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_different_bases:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_u16 v0, v0, off
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v0, v1, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr addrspace(5) %base_lo
@@ -122,47 +112,47 @@ bb:
 define <2 x half> @chain_hi_to_lo_arithmatic(ptr addrspace(5) %base, half %in) {
 ; GFX900-LABEL: chain_hi_to_lo_arithmatic:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic:
 ; GFX10_DEFAULT:       ; %bb.0: ; %bb
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10_DEFAULT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR_GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_arithmatic:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %arith_lo = fadd half %in, 1.0
@@ -177,32 +167,29 @@ bb:
 define <2 x half> @chain_hi_to_lo_group() {
 ; GCN-LABEL: chain_hi_to_lo_group:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    ds_read_u16 v0, v1 offset:2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_group:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    ds_read_u16 v0, v1 offset:2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_group:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    ds_load_u16 v0, v1 offset:2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr addrspace(3) null, i64 1
@@ -218,29 +205,26 @@ bb:
 define <2 x half> @chain_hi_to_lo_group_different_bases(ptr addrspace(3) %base_lo, ptr addrspace(3) %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_group_different_bases:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ds_read_u16 v0, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_read_u16_d16_hi v0, v1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_group_different_bases:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v0, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16_d16_hi v0, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_group_different_bases:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16_d16_hi v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr addrspace(3) %base_lo
@@ -255,38 +239,35 @@ bb:
 define <2 x half> @chain_hi_to_lo_global() {
 ; GCN-LABEL: chain_hi_to_lo_global:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    global_load_ushort v0, v[0:1], off
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    global_load_short_d16_hi v0, v[1:2], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_global:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    global_load_short_d16_hi v0, v[1:2], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_global:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[1:2], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr addrspace(1) null, i64 1
@@ -302,27 +283,24 @@ bb:
 define <2 x half> @chain_hi_to_lo_global_different_bases(ptr addrspace(1) %base_lo, ptr addrspace(1) %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_global_different_bases:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v0, v[0:1], off
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_load_short_d16_hi v0, v[2:3], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_global_different_bases:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    global_load_short_d16_hi v0, v[2:3], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_global_different_bases:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr addrspace(1) %base_lo
@@ -337,7 +315,6 @@ bb:
 define <2 x half> @chain_hi_to_lo_flat() {
 ; GCN-LABEL: chain_hi_to_lo_flat:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1]
@@ -345,12 +322,11 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_load_short_d16_hi v0, v[1:2]
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_flat:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
@@ -358,12 +334,11 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    flat_load_short_d16_hi v0, v[1:2]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_flat:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
@@ -371,7 +346,7 @@ define <2 x half> @chain_hi_to_lo_flat() {
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[1:2]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds half, ptr null, i64 1
@@ -387,29 +362,26 @@ bb:
 define <2 x half> @chain_hi_to_lo_flat_different_bases(ptr %base_lo, ptr %base_hi) {
 ; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1]
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_load_short_d16_hi v0, v[2:3]
-; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_flat_different_bases:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    flat_load_short_d16_hi v0, v[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_flat_different_bases:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u16 v0, v[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %load_lo = load half, ptr %base_lo
@@ -594,35 +566,35 @@ entry:
 define <2 x i16> @chain_hi_to_lo_group_other_dep(ptr addrspace(3) %ptr) {
 ; GCN-LABEL: chain_hi_to_lo_group_other_dep:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GCN-NEXT:    ds_read_u16_d16 v1, v0 offset:2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GFX10-NEXT:    ds_read_u16_d16 v1, v0 offset:2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16_d16_hi v1, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GFX11-NEXT:    ds_load_u16_d16 v1, v0 offset:2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
@@ -638,45 +610,45 @@ bb:
 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(ptr addrspace(3) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v1, v0 offset:2
 ; GFX900-NEXT:    ds_read_u16_d16_hi v0, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    ds_read_u16 v1, v0 offset:2
 ; FLATSCR-NEXT:    ds_read_u16_d16_hi v0, v0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v1, v0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v1, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16_d16_hi v0, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v1, v0 offset:2
 ; GFX11-NEXT:    ds_load_u16_d16_hi v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
@@ -691,57 +663,54 @@ bb:
 define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR-NEXT:    v_add_u32_e32 v2, 2, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
 ; FLATSCR-NEXT:    scratch_load_short_d16 v0, v2, off
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX10_DEFAULT:       ; %bb.0: ; %bb
-; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
 ; GFX10_DEFAULT-NEXT:    buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
 ; GFX10_DEFAULT-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_DEFAULT-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10_DEFAULT-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10_DEFAULT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR_GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v0, v2, off
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
 ; GFX11-NEXT:    scratch_load_d16_b16 v0, v2, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
@@ -756,7 +725,6 @@ bb:
 define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_global_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
@@ -764,11 +732,11 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX900-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX900-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_global_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc
@@ -776,22 +744,22 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
 ; FLATSCR-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:2 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_load_short_d16_hi v0, v[0:1], off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_global_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v2, v[0:1], off offset:2 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[0:1], off glc dlc
@@ -799,6 +767,7 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
 ; GFX11-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(1) %ptr, i64 1
@@ -813,7 +782,6 @@ bb:
 define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX900-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
@@ -822,11 +790,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
+; GFX900-NEXT:    s_waitcnt expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_flat_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    flat_load_ushort v2, v[0:1] offset:2 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    flat_load_short_d16_hi v0, v[0:1] glc
@@ -835,11 +803,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; FLATSCR-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; FLATSCR-NEXT:    s_waitcnt expcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, 2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ushort v2, v[2:3] glc dlc
@@ -848,11 +816,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_flat_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u16 v2, v[0:1] offset:2 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_load_d16_hi_b16 v0, v[0:1] glc dlc
@@ -860,6 +828,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
 ; GFX11-NEXT:    v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(0) %ptr, i64 1
@@ -874,7 +843,6 @@ bb:
 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, ptr addrspace(3) %may.alias) {
 ; GFX900-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX900-NEXT:    ds_read_u16 v2, v0
 ; GFX900-NEXT:    ds_write_b16 v1, v3
@@ -882,11 +850,11 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; FLATSCR:       ; %bb.0: ; %bb
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; FLATSCR-NEXT:    ds_read_u16 v2, v0
 ; FLATSCR-NEXT:    ds_write_b16 v1, v3
@@ -894,28 +862,29 @@ define <2 x i16> @chain_hi_to_lo_group_may_alias_store(ptr addrspace(3) %ptr, pt
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; FLATSCR-NEXT:    v_perm_b32 v0, v2, v0, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX10-NEXT:    ds_read_u16 v3, v0
 ; GFX10-NEXT:    ds_write_b16 v1, v2
 ; GFX10-NEXT:    ds_read_u16 v0, v0 offset:2
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX11-NEXT:    ds_load_u16 v3, v0
 ; GFX11-NEXT:    ds_store_b16 v1, v2
 ; GFX11-NEXT:    ds_load_u16 v0, v0 offset:2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(3) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 23e04c517aee656..77bb2a30bf64082 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -1570,32 +1570,32 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 define <2 x half> @v_clamp_cvt_pkrtz_src_v2f16_denorm(float %a, float %b) #0 {
 ; SI-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1
 ; GFX8-NEXT:    v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_max_f16_e64 v0, v0, v0 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_pkrtz_f16_f32 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_clamp_cvt_pkrtz_src_v2f16_denorm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_pk_rtz_f16_f32_e64 v0, v0, v1 clamp
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %add, <2 x half> zeroinitializer)
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 8b1c5c3ae071cdd..2c6fe910382e409 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -1010,7 +1010,6 @@ bb.end:                                           ; preds = %bb.then, %bb
 define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-LABEL: scc_liveness:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_movk_i32 s4, 0x207
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GCN-NEXT:    s_mov_b32 s8, 0
@@ -1033,6 +1032,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_and_b64 s[10:11], exec, vcc
 ; GCN-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[6:7]
 ; GCN-NEXT:    s_cbranch_execnz .LBB5_3
 ; GCN-NEXT:  ; %bb.4: ; %bb2
@@ -1075,12 +1075,12 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-O0-LABEL: scc_liveness:
 ; GCN-O0:       ; %bb.0: ; %bb
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -1393,7 +1393,7 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
 ; GCN-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; GCN-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    s_setpc_b64 s[30:31]
 bb:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
index 1be2fdec387b331..c727de21818aadb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll
@@ -7,7 +7,6 @@
 define i32 @combine_add_zext_xor() {
 ; GFX1010-LABEL: combine_add_zext_xor:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_branch .LBB0_2
 ; GFX1010-NEXT:  .LBB0_1: ; %bb9
@@ -29,11 +28,11 @@ define i32 @combine_add_zext_xor() {
 ; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX1010-NEXT:    s_branch .LBB0_1
 ; GFX1010-NEXT:  .LBB0_4: ; %.exit
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_xor:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_branch .LBB0_2
 ; GFX1100-NEXT:  .LBB0_1: ; %bb9
@@ -57,6 +56,7 @@ define i32 @combine_add_zext_xor() {
 ; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1100-NEXT:    s_branch .LBB0_1
 ; GFX1100-NEXT:  .LBB0_4: ; %.exit
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -87,7 +87,6 @@ bb9:                                              ; preds = %bb, %.a
 define i32 @combine_sub_zext_xor() {
 ; GFX1010-LABEL: combine_sub_zext_xor:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_branch .LBB1_2
 ; GFX1010-NEXT:  .LBB1_1: ; %bb9
@@ -109,11 +108,11 @@ define i32 @combine_sub_zext_xor() {
 ; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX1010-NEXT:    s_branch .LBB1_1
 ; GFX1010-NEXT:  .LBB1_4: ; %.exit
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_xor:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_branch .LBB1_2
 ; GFX1100-NEXT:  .LBB1_1: ; %bb9
@@ -137,6 +136,7 @@ define i32 @combine_sub_zext_xor() {
 ; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1100-NEXT:    s_branch .LBB1_1
 ; GFX1100-NEXT:  .LBB1_4: ; %.exit
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -167,7 +167,6 @@ bb9:                                              ; preds = %bb, %.a
 define i32 @combine_add_zext_or() {
 ; GFX1010-LABEL: combine_add_zext_or:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB2_2
 ; GFX1010-NEXT:  .LBB2_1: ; %bb9
@@ -191,11 +190,11 @@ define i32 @combine_add_zext_or() {
 ; GFX1010-NEXT:  .LBB2_4: ; %.exit
 ; GFX1010-NEXT:    s_or_b32 s4, s5, s6
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_or:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB2_2
 ; GFX1100-NEXT:  .LBB2_1: ; %bb9
@@ -220,6 +219,7 @@ define i32 @combine_add_zext_or() {
 ; GFX1100-NEXT:    s_or_b32 s0, s1, s2
 ; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -251,7 +251,6 @@ bb9:                                              ; preds = %bb, %.a
 define i32 @combine_sub_zext_or() {
 ; GFX1010-LABEL: combine_sub_zext_or:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_mov_b32 s4, 0
 ; GFX1010-NEXT:    s_branch .LBB3_2
 ; GFX1010-NEXT:  .LBB3_1: ; %bb9
@@ -275,11 +274,11 @@ define i32 @combine_sub_zext_or() {
 ; GFX1010-NEXT:  .LBB3_4: ; %.exit
 ; GFX1010-NEXT:    s_or_b32 s4, s5, s6
 ; GFX1010-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_or:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_mov_b32 s0, 0
 ; GFX1100-NEXT:    s_branch .LBB3_2
 ; GFX1100-NEXT:  .LBB3_1: ; %bb9
@@ -304,6 +303,7 @@ define i32 @combine_sub_zext_or() {
 ; GFX1100-NEXT:    s_or_b32 s0, s1, s2
 ; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -335,7 +335,6 @@ bb9:                                              ; preds = %bb, %.a
 define i32 @combine_add_zext_and() {
 ; GFX1010-LABEL: combine_add_zext_and:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_branch .LBB4_2
 ; GFX1010-NEXT:  .LBB4_1: ; %bb9
@@ -356,11 +355,11 @@ define i32 @combine_add_zext_and() {
 ; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX1010-NEXT:    s_branch .LBB4_1
 ; GFX1010-NEXT:  .LBB4_4: ; %.exit
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_add_zext_and:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_branch .LBB4_2
 ; GFX1100-NEXT:  .LBB4_1: ; %bb9
@@ -383,6 +382,7 @@ define i32 @combine_add_zext_and() {
 ; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1100-NEXT:    s_branch .LBB4_1
 ; GFX1100-NEXT:  .LBB4_4: ; %.exit
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
@@ -414,7 +414,6 @@ bb9:                                              ; preds = %bb, %.a
 define i32 @combine_sub_zext_and() {
 ; GFX1010-LABEL: combine_sub_zext_and:
 ; GFX1010:       ; %bb.0: ; %.entry
-; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1010-NEXT:    s_branch .LBB5_2
 ; GFX1010-NEXT:  .LBB5_1: ; %bb9
@@ -435,11 +434,11 @@ define i32 @combine_sub_zext_and() {
 ; GFX1010-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX1010-NEXT:    s_branch .LBB5_1
 ; GFX1010-NEXT:  .LBB5_4: ; %.exit
+; GFX1010-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: combine_sub_zext_and:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1100-NEXT:    s_branch .LBB5_2
 ; GFX1100-NEXT:  .LBB5_1: ; %bb9
@@ -462,6 +461,7 @@ define i32 @combine_sub_zext_and() {
 ; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX1100-NEXT:    s_branch .LBB5_1
 ; GFX1100-NEXT:  .LBB5_4: ; %.exit
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   br label %.a
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 50e9494fc03df6d..a0fdb706632f235 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -44,7 +44,6 @@ bb:
 define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture %dst) {
 ; GCN-LABEL: add1_i16:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
@@ -58,11 +57,11 @@ define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v0, vcc
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add1_i16:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 2, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
@@ -72,6 +71,7 @@ define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v0, vcc
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index 738c9b4ed6034db..3292a20bf513e43 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -71,9 +71,8 @@ entry:
 define i32 @load_2xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi16_combine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
   %p.0 = load i16, ptr addrspace(1) %p, align 4
@@ -88,11 +87,11 @@ define i32 @load_2xi16_combine(ptr addrspace(1) %p) #0 {
 define i32 @load_2xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi16_noncombine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v2, v[0:1], off
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 2
   %p.0 = load i16, ptr addrspace(1) %p, align 4
@@ -107,9 +106,8 @@ define i32 @load_2xi16_noncombine(ptr addrspace(1) %p) #0 {
 define i64 @load_2xi32_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi32_combine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i32, ptr addrspace(1) %p, i32 1
   %p.0 = load i32, ptr addrspace(1) %p, align 4
@@ -124,13 +122,13 @@ define i64 @load_2xi32_combine(ptr addrspace(1) %p) #0 {
 define i64 @load_2xi32_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_2xi32_noncombine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v2, v[0:1], off
 ; GCN-NEXT:    global_load_dword v3, v[0:1], off offset:8
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i32, ptr addrspace(1) %p, i32 2
   %p.0 = load i32, ptr addrspace(1) %p, align 4
@@ -145,9 +143,8 @@ define i64 @load_2xi32_noncombine(ptr addrspace(1) %p) #0 {
 define i64 @load_4xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_4xi16_combine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
   %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
@@ -173,13 +170,13 @@ define i64 @load_4xi16_combine(ptr addrspace(1) %p) #0 {
 define i64 @load_4xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_4xi16_noncombine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v2, v3
 ; GCN-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3
   %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
@@ -204,13 +201,13 @@ define i64 @load_4xi16_noncombine(ptr addrspace(1) %p) #0 {
 define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_3xi16_combine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v2, v[0:1], off
 ; GCN-NEXT:    global_load_ushort v3, v[0:1], off offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 1
   %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
@@ -230,13 +227,13 @@ define i64 @load_3xi16_combine(ptr addrspace(1) %p) #0 {
 define i64 @load_3xi16_noncombine(ptr addrspace(1) %p) #0 {
 ; GCN-LABEL: load_3xi16_noncombine:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v2, v[0:1], off
 ; GCN-NEXT:    global_load_dword v3, v[0:1], off offset:4
 ; GCN-NEXT:    s_mov_b32 s4, 0x3020504
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_perm_b32 v0, v2, v3, s4
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v3
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3
   %gep.2p = getelementptr i16, ptr addrspace(1) %p, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 8082a0646d4a139..d14ad91ec20fbcb 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -9,10 +9,10 @@
 define i1 @test1(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -23,10 +23,10 @@ define i1 @test1(i32 %arg1, i32 %arg2) {
 define i1 @test2(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -37,10 +37,10 @@ define i1 @test2(i32 %arg1, i32 %arg2) {
 define i1 @test3(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -51,10 +51,10 @@ define i1 @test3(i32 %arg1, i32 %arg2) {
 define i1 @test4(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test4:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -65,10 +65,10 @@ define i1 @test4(i32 %arg1, i32 %arg2) {
 define i1 @test5(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test5:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -79,10 +79,10 @@ define i1 @test5(i32 %arg1, i32 %arg2) {
 define i1 @test6(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test6:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -93,10 +93,10 @@ define i1 @test6(i32 %arg1, i32 %arg2) {
 define i1 @test7(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test7:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -107,10 +107,10 @@ define i1 @test7(i32 %arg1, i32 %arg2) {
 define i1 @test8(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test8:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -121,10 +121,10 @@ define i1 @test8(i32 %arg1, i32 %arg2) {
 define i1 @test9(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test9:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -135,10 +135,10 @@ define i1 @test9(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test10(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test10:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -149,10 +149,10 @@ define i1 @test10(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test11(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test11:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -163,10 +163,10 @@ define i1 @test11(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test12(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test12:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -177,10 +177,10 @@ define i1 @test12(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test13(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test13:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -191,10 +191,10 @@ define i1 @test13(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test14(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test14:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -205,10 +205,10 @@ define i1 @test14(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test15(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test15:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -219,10 +219,10 @@ define i1 @test15(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test16(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -233,10 +233,10 @@ define i1 @test16(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test17(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test17:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i32 %arg2, 1000
@@ -247,10 +247,10 @@ define i1 @test17(i32 %arg1, i32 %arg2) {
 define i1 @test18(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test18:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 1000
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -261,10 +261,10 @@ define i1 @test18(i32 %arg1, i32 %arg2) {
 define i1 @test19(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test19:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0x3e9, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -275,10 +275,10 @@ define i1 @test19(i32 %arg1, i32 %arg2) {
 define i1 @test20(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test20:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x3e9, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -289,10 +289,10 @@ define i1 @test20(i32 %arg1, i32 %arg2) {
 define i1 @test21(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test21:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -303,10 +303,10 @@ define i1 @test21(i32 %arg1, i32 %arg2) {
 define i1 @test22(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test22:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, 1000
   %cmp2 = icmp ugt i32 %arg2, 1000
@@ -317,10 +317,10 @@ define i1 @test22(i32 %arg1, i32 %arg2) {
 define i1 @test23(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test23:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0x3e7, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -331,10 +331,10 @@ define i1 @test23(i32 %arg1, i32 %arg2) {
 define i1 @test24(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test24:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 0x3e7, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, 1000
   %cmp2 = icmp uge i32 %arg2, 1000
@@ -345,10 +345,10 @@ define i1 @test24(i32 %arg1, i32 %arg2) {
 define i1 @test25(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -359,10 +359,10 @@ define i1 @test25(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test26(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test26:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -373,10 +373,10 @@ define i1 @test26(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test27(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test27:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_le_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -387,10 +387,10 @@ define i1 @test27(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test28(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test28:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -401,10 +401,10 @@ define i1 @test28(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test29(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test29:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -415,10 +415,10 @@ define i1 @test29(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test30(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test30:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg1, %arg3
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -429,10 +429,10 @@ define i1 @test30(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test31(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test31:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -443,10 +443,10 @@ define i1 @test31(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test32(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -457,10 +457,10 @@ define i1 @test32(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test33(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test33:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v1, 0x3e8, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg2
   %cmp2 = icmp slt i32 %arg1, 1000
@@ -471,7 +471,6 @@ define i1 @test33(i32 %arg1, i32 %arg2) {
 define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test34:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_i32 s0, 0x3e9
@@ -480,6 +479,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, 1000
   %cmp2 = icmp sle i32 %arg2, 1000
@@ -491,7 +491,6 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
 define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test35:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e8
@@ -500,6 +499,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, 1000
   %cmp2 = icmp sgt i32 %arg2, 1000
@@ -511,7 +511,6 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
 define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3) {
 ; GCN-LABEL: test36:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_lt_u32 s0, s6
@@ -520,6 +519,7 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -531,7 +531,6 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3) {
 ; GCN-LABEL: test37:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_ge_i32 s0, s6
@@ -540,6 +539,7 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, %arg3
   %cmp2 = icmp sge i32 %arg2, %arg3
@@ -551,7 +551,6 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test38:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_lt_u32 s0, 0x3e9
@@ -560,6 +559,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg1, 1000
   %cmp2 = icmp ule i32 %arg2, 1000
@@ -571,7 +571,6 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
 define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-LABEL: test39:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmpk_gt_i32 s0, 0x3e7
@@ -580,6 +579,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sge i32 %arg1, 1000
   %cmp2 = icmp sge i32 %arg2, 1000
@@ -591,7 +591,6 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
 define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3) {
 ; GCN-LABEL: test40:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_max_i32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_le_i32 s0, s6
@@ -600,6 +599,7 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sle i32 %arg1, %arg3
   %cmp2 = icmp sle i32 %arg2, %arg3
@@ -611,7 +611,6 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3) {
 ; GCN-LABEL: test41:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s0, s4, s5
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_cmp_ge_u32 s0, s6
@@ -620,6 +619,7 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GCN-NEXT:    global_store_b8 v[0:1], v2, off dlc
 ; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp uge i32 %arg1, %arg3
   %cmp2 = icmp uge i32 %arg2, %arg3
@@ -631,10 +631,10 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
 define i1 @test42(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test42:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -645,10 +645,10 @@ define i1 @test42(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test43(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test43:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg3, %arg1
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -659,10 +659,10 @@ define i1 @test43(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test44(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test44:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -673,10 +673,10 @@ define i1 @test44(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test45(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test45:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ugt i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg3, %arg2
@@ -687,10 +687,10 @@ define i1 @test45(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test46(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test46:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -701,10 +701,10 @@ define i1 @test46(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test47(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test47:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -715,10 +715,10 @@ define i1 @test47(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test48(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test48:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -729,10 +729,10 @@ define i1 @test48(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test49(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test49:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -743,10 +743,10 @@ define i1 @test49(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test50(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test50:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg3, %arg1
   %cmp2 = icmp sgt i32 %arg2, %arg3
@@ -757,10 +757,10 @@ define i1 @test50(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test51(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test51:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg1, %arg3
   %cmp2 = icmp slt i32 %arg3, %arg2
@@ -771,10 +771,10 @@ define i1 @test51(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test52(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test52:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, %arg3
   %cmp2 = icmp sgt i32 %arg3, %arg2
@@ -785,10 +785,10 @@ define i1 @test52(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test53(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test53:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp sgt i32 %arg3, %arg1
   %cmp2 = icmp slt i32 %arg2, %arg3
@@ -799,10 +799,10 @@ define i1 @test53(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test54(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-LABEL: test54:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -813,10 +813,10 @@ define i1 @test54(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test55(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-LABEL: test55:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -827,10 +827,10 @@ define i1 @test55(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test56(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-LABEL: test56:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -841,10 +841,10 @@ define i1 @test56(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test57(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-LABEL: test57:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -855,18 +855,18 @@ define i1 @test57(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test58:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test58:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -877,18 +877,18 @@ define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test59:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test59:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -899,18 +899,18 @@ define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test60:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test60:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -921,18 +921,18 @@ define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test61:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test61:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -943,11 +943,11 @@ define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test62(float %arg1, float %arg2, float %arg3) {
 ; GCN-LABEL: test62:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -960,12 +960,12 @@ define i1 @test62(float %arg1, float %arg2, float %arg3) {
 define i1 @test63(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-LABEL: test63:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -978,12 +978,12 @@ define i1 @test63(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test64(double %arg1, double %arg2, double %arg3) #0 {
 ; GCN-LABEL: test64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -996,11 +996,11 @@ define i1 @test64(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test65(float %arg1, float %arg2, float %arg3) {
 ; GCN-LABEL: test65:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1013,12 +1013,12 @@ define i1 @test65(float %arg1, float %arg2, float %arg3) {
 define i1 @test66(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test66:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1031,11 +1031,11 @@ define i1 @test66(double %arg1, double %arg2, double %arg3) {
 define i1 @test67(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-LABEL: test67:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1048,11 +1048,11 @@ define i1 @test67(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test68(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-LABEL: test68:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1065,12 +1065,12 @@ define i1 @test68(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test69(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test69:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1083,19 +1083,19 @@ define i1 @test69(double %arg1, double %arg2, double %arg3) {
 define i1 @test70(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test70:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test70:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1108,12 +1108,12 @@ define i1 @test70(float %arg1, float %arg2, float %arg3) {
 define i1 @test71(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test71:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1126,12 +1126,12 @@ define i1 @test71(double %arg1, double %arg2, double %arg3) {
 define i1 @test72(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test72:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1144,19 +1144,19 @@ define i1 @test72(double %arg1, double %arg2, double %arg3) {
 define i1 @test73(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test73:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test73:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1169,22 +1169,22 @@ define i1 @test73(float %arg1, float %arg2, float %arg3) {
 define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test74:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test74:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1197,19 +1197,19 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 define i1 @test75(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test75:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test75:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1222,19 +1222,19 @@ define i1 @test75(float %arg1, float %arg2, float %arg3) {
 define i1 @test76(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test76:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test76:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1247,22 +1247,22 @@ define i1 @test76(float %arg1, float %arg2, float %arg3) {
 define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test77:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test77:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1275,10 +1275,10 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 define i1 @test78(float %arg1, float %arg2, float %arg3) #0 {
 ; GCN-LABEL: test78:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -1289,18 +1289,18 @@ define i1 @test78(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test79:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test79:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -1311,11 +1311,11 @@ define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test80(float %arg1, float %arg2, float %arg3) {
 ; GCN-LABEL: test80:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v1, 2.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, 1.0
   %add2 = fadd nnan float %arg2, 2.0
@@ -1328,12 +1328,12 @@ define i1 @test80(float %arg1, float %arg2, float %arg3) {
 define i1 @test81(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test81:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 2.0
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan double %arg1, 1.0
   %add2 = fadd nnan double %arg2, 2.0
@@ -1346,12 +1346,12 @@ define i1 @test81(double %arg1, double %arg2, double %arg3) {
 define i1 @test82(double %arg1, double %arg2, double %arg3) {
 ; GCN-LABEL: test82:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -1364,19 +1364,19 @@ define i1 @test82(double %arg1, double %arg2, double %arg3) {
 define i1 @test83(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test83:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test83:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -1389,20 +1389,20 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
 define i1 @test84(half %arg1, half %arg2, half %arg3) {
 ; GFX11-LABEL: test84:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test84:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1415,7 +1415,6 @@ define i1 @test84(half %arg1, half %arg2, half %arg3) {
 define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-LABEL: test85:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
@@ -1425,11 +1424,11 @@ define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test85:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -1437,6 +1436,7 @@ define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
@@ -1449,7 +1449,6 @@ define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-LABEL: test86:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
@@ -1459,11 +1458,11 @@ define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test86:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -1471,6 +1470,7 @@ define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
@@ -1483,20 +1483,20 @@ define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 define i1 @test87(half %arg1, half %arg2, half %arg3) {
 ; GFX11-LABEL: test87:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test87:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1509,7 +1509,6 @@ define i1 @test87(half %arg1, half %arg2, half %arg3) {
 define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-LABEL: test88:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
@@ -1519,11 +1518,11 @@ define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test88:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -1531,6 +1530,7 @@ define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
@@ -1543,20 +1543,20 @@ define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 define i1 @test89(half %arg1, half %arg2, half %arg3) {
 ; GFX11-LABEL: test89:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test89:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1569,20 +1569,20 @@ define i1 @test89(half %arg1, half %arg2, half %arg3) {
 define i1 @test90(half %arg1, half %arg2, half %arg3) {
 ; GFX11-LABEL: test90:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test90:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
@@ -1595,7 +1595,6 @@ define i1 @test90(half %arg1, half %arg2, half %arg3) {
 define <2 x i1> @test91(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-LABEL: test91:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
@@ -1605,11 +1604,11 @@ define <2 x i1> @test91(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test91:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
@@ -1617,6 +1616,7 @@ define <2 x i1> @test91(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v1
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
@@ -1629,10 +1629,10 @@ define <2 x i1> @test91(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 define i1 @test92(i32 %arg1, i32 %arg2, i32 %arg3, i32 %C) {
 ; GCN-LABEL: test92:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1645,13 +1645,13 @@ define i1 @test92(i32 %arg1, i32 %arg2, i32 %arg3, i32 %C) {
 define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test93:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max_u32_e32 v1, v2, v3
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v4
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1666,13 +1666,13 @@ define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test94(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %C) {
 ; GCN-LABEL: test94:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GCN-NEXT:    v_min3_u32 v0, v5, v6, v0
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1695,10 +1695,10 @@ define i1 @test94(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %ar
 define i1 @test95(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test95:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1711,10 +1711,10 @@ define i1 @test95(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test96(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test96:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1727,11 +1727,11 @@ define i1 @test96(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test97(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test97:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_u32 v0, v0, v2, v3
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1746,11 +1746,11 @@ define i1 @test97(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test98(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test98:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1765,11 +1765,11 @@ define i1 @test98(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test99(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test99:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1784,11 +1784,11 @@ define i1 @test99(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test100(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 ; GCN-LABEL: test100:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1803,12 +1803,12 @@ define i1 @test100(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) {
 define i1 @test101(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %C) {
 ; GCN-LABEL: test101:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_minmax_u32 v1, v3, v4, v5
 ; GCN-NEXT:    v_min3_u32 v0, v0, v2, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1827,12 +1827,12 @@ define i1 @test101(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test102(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %C) {
 ; GCN-LABEL: test102:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_min_u32_e32 v1, v2, v3
 ; GCN-NEXT:    v_min3_u32 v0, v0, v5, v1
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1852,7 +1852,6 @@ define i1 @test102(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %C) {
 ; GCN-LABEL: test103:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v4, v4, v5
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_u32 v0, v0, v1, v4
@@ -1860,6 +1859,7 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v0, v6
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1879,7 +1879,6 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %C) {
 ; GCN-LABEL: test104:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v8, v8, v9
 ; GCN-NEXT:    v_max_u32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min_u32_e32 v3, v4, v5
@@ -1893,6 +1892,7 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1919,7 +1919,6 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %C) {
 ; GCN-LABEL: test105:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max_u32_e32 v1, v2, v3
 ; GCN-NEXT:    v_max_u32_e32 v2, v4, v5
@@ -1932,6 +1931,7 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s1, s2, s1
 ; GCN-NEXT:    s_and_b32 s0, s0, s1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C
   %cmp2 = icmp ult i32 %arg2, %C
@@ -1954,7 +1954,6 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %C1, i32 %C2) {
 ; GCN-LABEL: test106:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v6, v6, v7
 ; GCN-NEXT:    v_min_u32_e32 v0, v0, v1
 ; GCN-NEXT:    v_min_u32_e32 v1, v10, v11
@@ -1969,6 +1968,7 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    s_or_b32 s0, s2, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %C1
   %cmp2 = icmp ult i32 %arg2, %C1
@@ -1999,10 +1999,10 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a
 define i1 @test107(float %arg1, float %arg2, float %arg3, float %C) {
 ; GCN-LABEL: test107:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2015,18 +2015,18 @@ define i1 @test107(float %arg1, float %arg2, float %arg3, float %C) {
 define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-LABEL: test108:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test108:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2039,7 +2039,6 @@ define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
 define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C) {
 ; GFX11-LABEL: test109:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
 ; GFX11-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
@@ -2047,16 +2046,17 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test109:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
 ; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2071,7 +2071,6 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1, float %C2, float %C3, float %C4, float %C) #0 {
 ; GCN-LABEL: test110:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_dual_max_f32 v0, v0, v1 :: v_dual_min_f32 v1, v2, v3
@@ -2079,6 +2078,7 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 ; GCN-NEXT:    v_cmp_gt_f32_e64 s0, v1, v8
 ; GCN-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2097,7 +2097,6 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %C) {
 ; GFX11-LABEL: test111:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
 ; GFX11-NEXT:    v_dual_min_f32 v2, v2, v3 :: v_dual_max_f32 v3, v4, v4
 ; GFX11-NEXT:    v_min3_f32 v0, v0, v1, v2
@@ -2105,17 +2104,18 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_min3_f32 v0, v5, v6, v0
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test111:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2138,7 +2138,6 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %C) {
 ; GFX11-LABEL: test112:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v4, v8
 ; GFX11-NEXT:    v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3
@@ -2148,17 +2147,18 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v0, v8
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test112:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2181,21 +2181,21 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-LABEL: test113:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test113:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_maxmin_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
@@ -2208,23 +2208,23 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
 define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-LABEL: test114:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v2, v3
 ; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test114:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
 ; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
 ; GFX11NONANS-NEXT:    s_and_b32 s0, s0, vcc_lo
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt float %arg1, %C
   %cmp2 = fcmp ogt float %arg2, %C
@@ -2237,7 +2237,6 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C) {
 ; GFX11-LABEL: test115:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
@@ -2245,15 +2244,16 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cmp_nge_f32_e64 s0, v1, v4
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test115:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2270,7 +2270,6 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, float %arg10, float %C) {
 ; GFX11-LABEL: test116:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
@@ -2288,11 +2287,11 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test116:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v8, v8, v9
 ; GFX11NONANS-NEXT:    v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v4, v6, v7
@@ -2305,6 +2304,7 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s1, s2, vcc_lo
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
@@ -2331,7 +2331,6 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, float %arg8, float %arg9, float %arg10, float %arg11, float %arg12, float %C1, float %C2) {
 ; GFX11-LABEL: test117:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v6, v6, v6
 ; GFX11-NEXT:    v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v10, v10, v10
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
@@ -2349,11 +2348,11 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_or_b32 s0, s2, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test117:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v6, v6, v7
 ; GFX11NONANS-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
@@ -2367,6 +2366,7 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11NONANS-NEXT:    s_or_b32 s0, s2, s0
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C1
   %cmp2 = fcmp olt float %arg2, %C1
@@ -2398,13 +2398,13 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 define i1 @test118(float %arg1, float %arg2, float %arg3, float %arg4, float %C1, float %C2, float %C3, float %C4, float %C) #0 {
 ; GCN-LABEL: test118:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_max3_f32 v0, v0, v2, v3
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2423,13 +2423,13 @@ define i1 @test118(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 define i1 @test119(float %arg1, float %arg2, float %arg3, float %arg4, float %C1, float %C2, float %C3, float %C4, float %C) #0 {
 ; GCN-LABEL: test119:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_minmax_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2448,13 +2448,13 @@ define i1 @test119(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 define i1 @test120(float %arg1, float %arg2, float %arg3, float %arg4, float %C1, float %C2, float %C3, float %C4, float %C) #0 {
 ; GCN-LABEL: test120:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2473,13 +2473,13 @@ define i1 @test120(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 define i1 @test121(float %arg1, float %arg2, float %arg3, float %arg4, float %C1, float %C2, float %C3, float %C4, float %C) #0 {
 ; GCN-LABEL: test121:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
 ; GCN-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GCN-NEXT:    v_maxmin_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add1 = fadd nnan float %arg1, %C1
   %add2 = fadd nnan float %arg2, %C2
@@ -2498,10 +2498,10 @@ define i1 @test121(float %arg1, float %arg2, float %arg3, float %arg4, float %C1
 define i1 @test122(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN-LABEL: test122:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2512,12 +2512,12 @@ define i1 @test122(double %arg1, double %arg2, double %arg3) #1 {
 define i1 @test123(double %arg1, double %arg2, double %arg3) #1 {
 ; GCN-LABEL: test123:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GCN-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GCN-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2532,11 +2532,11 @@ define i1 @test123(double %arg1, double %arg2, double %arg3) #1 {
 define i1 @test124(i32 %arg1, i64 %arg2) {
 ; GCN-LABEL: test124:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2]
 ; GCN-NEXT:    v_cmp_gt_i32_e64 s0, 0x3e8, v0
 ; GCN-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp slt i32 %arg1, 1000
   %cmp2 = icmp slt i64 %arg2, 1000
@@ -2547,11 +2547,11 @@ define i1 @test124(i32 %arg1, i64 %arg2) {
 define i1 @test125(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test125:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp eq i32 %arg1, 1000
   %cmp2 = icmp eq i32 %arg2, 1000
@@ -2562,11 +2562,11 @@ define i1 @test125(i32 %arg1, i32 %arg2) {
 define i1 @test126(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test126:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ne i32 %arg1, 1000
   %cmp2 = icmp ne i32 %arg2, 1000
@@ -2577,11 +2577,11 @@ define i1 @test126(i32 %arg1, i32 %arg2) {
 define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) {
 ; GCN-LABEL: test127:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
 ; GCN-NEXT:    v_cmp_lt_u64_e64 s0, v[2:3], v[4:5]
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
    %cmp1 = icmp ult i64 %arg1, %arg3
    %cmp2 = icmp ult i64 %arg2, %arg3
@@ -2592,11 +2592,11 @@ define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) {
 define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test128:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v2, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg3, %arg2
@@ -2607,11 +2607,11 @@ define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test129:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_le_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ule i32 %arg2, %arg3
@@ -2622,11 +2622,11 @@ define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
 ; GCN-LABEL: test130:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc_lo, v2, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, v1, v2
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ule i32 %arg3, %arg1
   %cmp2 = icmp ugt i32 %arg2, %arg3
@@ -2637,11 +2637,11 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
 define i1 @test131(i16 %arg1, i32 %arg2) {
 ; GCN-LABEL: test131:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i16 %arg1, 10
   %cmp2 = icmp ult i32 %arg2, 10
@@ -2652,7 +2652,6 @@ define i1 @test131(i16 %arg1, i32 %arg2) {
 define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 ; GCN-LABEL: test132:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v0, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s0, v1, v2
 ; GCN-NEXT:    v_cmp_lt_u32_e64 s1, v0, v3
@@ -2660,6 +2659,7 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 ; GCN-NEXT:    s_or_b32 s1, s1, vcc_lo
 ; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, %arg3
   %cmp2 = icmp ult i32 %arg2, %arg3
@@ -2673,11 +2673,11 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) {
 define i1 @test133(i32 %arg1, i32 %arg2) {
 ; GCN-LABEL: test133:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 0x3e8, v1
 ; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i32 %arg1, 100
   %cmp2 = icmp ult i32 %arg2, 1000
@@ -2688,19 +2688,19 @@ define i1 @test133(i32 %arg1, i32 %arg2) {
 define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test134:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_gt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test134:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
@@ -2711,19 +2711,19 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test135:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nle_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test135:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
@@ -2734,23 +2734,23 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test136:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_ge_f64_e64 s0, v[4:5], v[2:3]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test136:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -2763,20 +2763,20 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 define i1 @test137(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test137:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v2, v1
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test137:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -2789,19 +2789,19 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) {
 define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test138:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test138:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
@@ -2812,19 +2812,19 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test139:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test139:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
@@ -2835,19 +2835,19 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test140:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test140:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
@@ -2858,19 +2858,19 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test141:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test141:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
@@ -2881,19 +2881,19 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test142:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test142:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
@@ -2904,19 +2904,19 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test143:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test143:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
@@ -2927,19 +2927,19 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-LABEL: test144:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test144:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
@@ -2950,19 +2950,19 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
 define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-LABEL: test145:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test145:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
@@ -2973,20 +2973,20 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
 define i1 @test146(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test146:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test146:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -2999,23 +2999,23 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) {
 define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test147:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_le_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test147:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3028,23 +3028,23 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test148:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_gt_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test148:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3057,20 +3057,20 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 define i1 @test149(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test149:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test149:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3083,23 +3083,23 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) {
 define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test150:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nle_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test150:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
@@ -3112,20 +3112,20 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 define i1 @test151(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test151:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_nlt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test151:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3138,20 +3138,20 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) {
 define i1 @test152(float %arg1, float %arg2, float %arg3) {
 ; GFX11-LABEL: test152:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v0, v2
 ; GFX11-NEXT:    v_cmp_ngt_f32_e64 s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test152:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
@@ -3164,23 +3164,23 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) {
 define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11-LABEL: test153:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11-NEXT:    v_cmp_nge_f64_e64 s0, v[2:3], v[4:5]
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11NONANS-LABEL: test153:
 ; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
 ; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
 ; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
index 11795cca18daa25..0bb56adedac9732 100644
--- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
@@ -4,10 +4,10 @@
 define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %A = sext i8 %X to i48
   %B = sext i8 %Y to i48
@@ -23,12 +23,12 @@ define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 define i16 @num_sign_bits_mul_i48_1(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %A = sext i8 %X to i48
   %B = sext i8 %Y to i48
@@ -45,7 +45,6 @@ define i16 @num_sign_bits_mul_i48_1(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 define i32 @num_sign_bits_mul_i32_7(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-LABEL: num_sign_bits_mul_i32_7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 25
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 25
@@ -53,6 +52,7 @@ define i32 @num_sign_bits_mul_i32_7(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.shl = shl i32 %x, 7
   %x.bits = ashr i32 %x.shl, 7
@@ -75,10 +75,10 @@ define i32 @num_sign_bits_mul_i32_7(i32 %x, i32 %y, i32 %z, i32 %w) {
 define i32 @num_sign_bits_mul_i32_8(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-LABEL: num_sign_bits_mul_i32_8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.shl = shl i32 %x, 8
   %x.bits = ashr i32 %x.shl, 8
@@ -101,7 +101,6 @@ define i32 @num_sign_bits_mul_i32_8(i32 %x, i32 %y, i32 %z, i32 %w) {
 define i32 @num_sign_bits_mul_i32_9(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-LABEL: num_sign_bits_mul_i32_9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 23
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 23
@@ -109,6 +108,7 @@ define i32 @num_sign_bits_mul_i32_9(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.shl = shl i32 %x, 9
   %x.bits = ashr i32 %x.shl, 9
@@ -131,7 +131,6 @@ define i32 @num_sign_bits_mul_i32_9(i32 %x, i32 %y, i32 %z, i32 %w) {
 define i32 @num_sign_bits_mul_i32_10(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-LABEL: num_sign_bits_mul_i32_10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 22
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 22
 ; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 22
@@ -139,6 +138,7 @@ define i32 @num_sign_bits_mul_i32_10(i32 %x, i32 %y, i32 %z, i32 %w) {
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v2, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %x.shl = shl i32 %x, 10
   %x.bits = ashr i32 %x.shl, 10
@@ -161,8 +161,8 @@ define i32 @num_sign_bits_mul_i32_10(i32 %x, i32 %y, i32 %z, i32 %w) {
 define i32 @known_bits_mul24() {
 ; GFX9-LABEL: known_bits_mul24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %r0 = call i32 @llvm.amdgcn.mul.i24(i32 0, i32 -7)
   %r1 = shl i32 %r0, 2
diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
index 53c9861b7a0516b..7ce1dea3c08a4e7 100644
--- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
+++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll
@@ -5,23 +5,23 @@
 define i16 @csh_16(i16 %a, i16 %b) {
 ; CHECK-LABEL: csh_16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b16_e32 v2, v1, v0
 ; CHECK-NEXT:    v_lshrrev_b16_e32 v3, v1, v0
 ; CHECK-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
 ; CHECK-NEXT:    v_add_u16_e32 v1, v2, v3
 ; CHECK-NEXT:    v_add_u16_e32 v0, v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: csh_16:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GISEL-NEXT:    v_lshlrev_b16_e32 v2, v1, v0
 ; GISEL-NEXT:    v_lshrrev_b16_e32 v3, v1, v0
 ; GISEL-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
 ; GISEL-NEXT:    v_add_u16_e32 v1, v2, v3
 ; GISEL-NEXT:    v_add_u16_e32 v0, v1, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and i16 %b, 15
   %shl = shl i16 %a, %and
@@ -35,21 +35,21 @@ define i16 @csh_16(i16 %a, i16 %b) {
 define i32 @csh_32(i32 %a, i32 %b) {
 ; CHECK-LABEL: csh_32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v3, v1, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
 ; CHECK-NEXT:    v_add3_u32 v0, v2, v3, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: csh_32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
 ; GISEL-NEXT:    v_lshrrev_b32_e32 v3, v1, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v0, v1, v0
 ; GISEL-NEXT:    v_add3_u32 v0, v2, v3, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %b, 31
   %shl = shl i32 %a, %and
@@ -117,7 +117,6 @@ define amdgpu_ps i32 @s_csh_32_1(i32 inreg %a, i32 inreg %b) {
 define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: csh_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v8, v7, v3
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v9, v6, v2
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v10, v5, v1
@@ -134,11 +133,11 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    v_add3_u32 v1, v10, v14, v1
 ; CHECK-NEXT:    v_add3_u32 v2, v9, v13, v2
 ; CHECK-NEXT:    v_add3_u32 v3, v8, v12, v3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: csh_v4i32:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v4, 31, v4
 ; GISEL-NEXT:    v_and_b32_e32 v5, 31, v5
 ; GISEL-NEXT:    v_and_b32_e32 v6, 31, v6
@@ -159,6 +158,7 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; GISEL-NEXT:    v_add3_u32 v1, v9, v13, v1
 ; GISEL-NEXT:    v_add3_u32 v2, v10, v14, v2
 ; GISEL-NEXT:    v_add3_u32 v3, v11, v15, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
   %shl = shl <4 x i32> %a, %and
@@ -233,7 +233,6 @@ define amdgpu_ps <4 x i32> @s_csh_v4i32(<4 x i32> inreg %a, <4 x i32> inreg %b)
 define i64 @csh_64(i64 %a, i64 %b) {
 ; CHECK-LABEL: csh_64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b64 v[3:4], v2, v[0:1]
 ; CHECK-NEXT:    v_lshrrev_b64 v[5:6], v2, v[0:1]
 ; CHECK-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
@@ -241,11 +240,11 @@ define i64 @csh_64(i64 %a, i64 %b) {
 ; CHECK-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
 ; CHECK-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: csh_64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v6, 63, v2
 ; GISEL-NEXT:    v_lshlrev_b64 v[2:3], v6, v[0:1]
 ; GISEL-NEXT:    v_lshrrev_b64 v[4:5], v6, v[0:1]
@@ -254,6 +253,7 @@ define i64 @csh_64(i64 %a, i64 %b) {
 ; GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
 ; GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and i64 %b, 63
   %shl = shl i64 %a, %and
@@ -331,15 +331,15 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
 define i32 @cshl_or(i32 %a, i32 %b) {
 ; CHECK-LABEL: cshl_or:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_or_b32 v0, v0, v1, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: cshl_or:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GISEL-NEXT:    v_lshl_or_b32 v0, v0, v1, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %b, 31
   %shl = shl i32 %a, %and
@@ -350,15 +350,15 @@ define i32 @cshl_or(i32 %a, i32 %b) {
 define i32 @cshl_add(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: cshl_add:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: cshl_add:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v1, 31, v1
 ; GISEL-NEXT:    v_lshl_add_u32 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %b, 31
   %shl = shl i32 %a, %and
@@ -369,15 +369,15 @@ define i32 @cshl_add(i32 %a, i32 %b, i32 %c) {
 define i32 @add_cshl(i32 %a, i32 %b) {
 ; CHECK-LABEL: add_cshl:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_lshl_u32 v0, v0, v1, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: add_cshl:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v2, 31, v1
 ; GISEL-NEXT:    v_add_lshl_u32 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %a, %b
   %and = and i32 %b, 31
diff --git a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
index 895185cb41a3974..94d578eef1bed35 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
@@ -8,7 +8,6 @@
   ; GFX908-LABEL: test_spill:
   ; GFX908:       ; %bb.0:
   ; GFX908-NEXT:    ; implicit-def: $agpr96_agpr97_agpr98_agpr99_agpr100_agpr101_agpr102_agpr103_agpr104_agpr105_agpr106_agpr107_agpr108_agpr109_agpr110_agpr111
-  ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GFX908-NEXT:    v_accvgpr_read_b32 v63, a96
   ; GFX908-NEXT:    ; implicit-def: $vgpr0
   ; GFX908-NEXT:    ; implicit-def: $vgpr1
@@ -218,7 +217,6 @@
   ; GFX90A-LABEL: test_spill:
   ; GFX90A:       ; %bb.0:
   ; GFX90A-NEXT:    ; implicit-def: $agpr96_agpr97_agpr98_agpr99_agpr100_agpr101_agpr102_agpr103_agpr104_agpr105_agpr106_agpr107_agpr108_agpr109_agpr110_agpr111
-  ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GFX90A-NEXT:    v_accvgpr_mov_b32 a64, a96
   ; GFX90A-NEXT:    v_accvgpr_mov_b32 a65, a97
   ; GFX90A-NEXT:    v_accvgpr_mov_b32 a66, a98
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index f97d15739be2965..d8ec4d1db84c713 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -26,9 +26,9 @@
 define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-LABEL: call_split_type_used_outside_block_v2f32:
 ; GCN:       ; %bb.0: ; %bb0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -62,9 +62,9 @@ bb1:
 define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-LABEL: call_split_type_used_outside_block_v3f32:
 ; GCN:       ; %bb.0: ; %bb0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -98,9 +98,9 @@ bb1:
 define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-LABEL: call_split_type_used_outside_block_v4f16:
 ; GCN:       ; %bb.0: ; %bb0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -134,9 +134,9 @@ bb1:
 define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-LABEL: call_split_type_used_outside_block_struct:
 ; GCN:       ; %bb.0: ; %bb0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
index 674dd00575e1444..c446f9f60d897cf 100644
--- a/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
+++ b/llvm/test/CodeGen/AMDGPU/cse-convergent.ll
@@ -4,7 +4,7 @@
 define i32 @test(i32 %val, i32 %cond) {
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -45,7 +45,7 @@ define i32 @test(i32 %val, i32 %cond) {
 ; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    s_waitcnt_depctr 0xffe3
 ; GCN-NEXT:    s_mov_b32 exec_lo, s4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; %x = subgroup operation over all lanes.
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 901cbd4a5272059..af87cfe3d947752 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -11,26 +11,26 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i32_to_f32_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i32_to_f32_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i32_to_f32_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to float
@@ -40,26 +40,26 @@ define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i32_to_f32_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i32_to_f32_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i32_to_f32_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = sitofp i32 %masked to float
@@ -69,31 +69,31 @@ define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f32_lshr7_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f32_lshr7_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.7 = lshr i32 %arg0, 7
   %masked = and i32 %lshr.7, 255
@@ -104,26 +104,26 @@ define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f32_lshr8_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -132,51 +132,38 @@ define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
 }
 
 define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
-; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GCN-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX10-NEXT:    global_store_dword v[0:1], v1, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX9-NEXT:    global_store_dword v[0:1], v1, off
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   store i32 %lshr.8, ptr addrspace(1) undef
@@ -188,26 +175,26 @@ define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f32_lshr16_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -218,26 +205,26 @@ define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f32_lshr24_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f32_lshr24_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -248,26 +235,26 @@ define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
 define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_i8_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i8_to_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i8_to_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i8_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to float
   ret float %cvt
@@ -276,35 +263,35 @@ define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
 define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v2i8_to_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v2i8_to_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v2i8_to_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = bitcast i16 %arg0 to <2 x i8>
   %cvt = uitofp <2 x i8> %val to <2 x float>
@@ -314,39 +301,39 @@ define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
 define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v3i8_to_v3f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v3i8_to_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v3i8_to_v3f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc i32 %arg0 to i24
   %val = bitcast i24 %trunc to <3 x i8>
@@ -357,43 +344,43 @@ define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
 define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_v4i8_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_v4i8_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_v4i8_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = bitcast i32 %arg0 to <4 x i8>
   %cvt = uitofp <4 x i8> %val to <4 x float>
@@ -403,43 +390,43 @@ define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
 define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_unpack_i32_to_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mask.arg0 = and i32 %arg0, 255
   %cvt0 = uitofp i32 %mask.arg0 to float
@@ -466,39 +453,39 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
 define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i32_to_f16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i32_to_f16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i32_to_f16_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i32_to_f16_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to half
@@ -508,39 +495,39 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_sitofp_i32_to_f16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_sitofp_i32_to_f16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_sitofp_i32_to_f16_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_sitofp_i32_to_f16_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = sitofp i32 %masked to half
@@ -550,39 +537,39 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -593,39 +580,39 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -636,39 +623,39 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -679,36 +666,36 @@ define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
 define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i8_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i8_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i8_to_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i8_to_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i8_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to half
   ret half %cvt
@@ -717,31 +704,31 @@ define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
 define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i32_to_f64_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i32_to_f64_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i32_to_f64_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to double
@@ -751,31 +738,31 @@ define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f64_lshr8_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f64_lshr8_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 8, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
@@ -786,31 +773,31 @@ define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f64_lshr16_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f64_lshr16_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
@@ -821,31 +808,31 @@ define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
 ; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_to_f64_lshr24_mask255:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_to_f64_lshr24_mask255:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
@@ -856,42 +843,42 @@ define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
 define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
 ; SI-LABEL: v_uitofp_i8_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uitofp_i8_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uitofp_i8_to_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX10-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uitofp_i8_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_uitofp_i8_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to double
   ret double %cvt
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
index 8c4643fabb2a344..fcc356683dfa045 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
@@ -4,10 +4,9 @@
 define i32 @mullohi_u32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_u32:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0
 ; CHECK-NEXT:    flat_store_dword v[2:3], v1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i32 %arg to i64
@@ -23,10 +22,9 @@ bb:
 define i32 @mullohi_s32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_s32:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v1, v0, 0
 ; CHECK-NEXT:    flat_store_dword v[2:3], v1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = sext i32 %arg to i64
@@ -42,11 +40,10 @@ bb:
 define i32 @mullohi_u32_non_const_shift(i32 %arg, i32 %arg1, ptr %arg2, i64 %shift) {
 ; CHECK-LABEL: mullohi_u32_non_const_shift:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v1, v0, 0
 ; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[5:6]
 ; CHECK-NEXT:    flat_store_dword v[2:3], v6
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i32 %arg to i64
@@ -63,7 +60,6 @@ bb:
 define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_2xu32:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v6, v1
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
 ; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v6, 0
@@ -71,7 +67,7 @@ define <2 x i32> @mullohi_2xu32(<2 x i32> %arg, <2 x i32> %arg1, ptr %arg2) {
 ; CHECK-NEXT:    v_mov_b32_e32 v7, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    flat_store_dwordx2 v[4:5], v[6:7]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext <2 x i32> %arg to <2 x i64>
@@ -87,11 +83,10 @@ bb:
 define i8 @mullohi_illegal_ty(i8 %arg, i8 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mullohi_illegal_ty:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; CHECK-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 ; CHECK-NEXT:    flat_store_byte v[2:3], v1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i8 %arg to i16
@@ -107,11 +102,10 @@ bb:
 define i32 @mul_one_bit_low_hi_u32(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_low_hi_u32:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0
 ; CHECK-NEXT:    v_alignbit_b32 v0, v1, v0, 31
 ; CHECK-NEXT:    flat_store_dword v[2:3], v1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i32 %arg to i64
@@ -128,11 +122,10 @@ bb:
 define i32 @mul_one_bit_hi_hi_u32_lshr_lshr(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_lshr:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v1, v0
 ; CHECK-NEXT:    flat_store_dword v[2:3], v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i32 %arg to i64
@@ -149,11 +142,10 @@ bb:
 define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
 ; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v0
 ; CHECK-NEXT:    v_ashrrev_i64 v[0:1], 33, v[3:4]
 ; CHECK-NEXT:    flat_store_dword v[2:3], v4
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = zext i32 %arg to i64
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index fe649d433304178..1ef0e2e4bd192a5 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -239,19 +239,19 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
 define float @fmac_sequence_simple(float %a, float %b, float %c, float %d, float %e) #0 {
 ; GFX10-LABEL: fmac_sequence_simple:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmac_sequence_simple:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v2, v2, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %t0 = fmul fast float %a, %b
   %t1 = fmul fast float %c, %d
@@ -263,22 +263,22 @@ define float @fmac_sequence_simple(float %a, float %b, float %c, float %d, float
 define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %d, float %e, float %f, float %g) #0 {
 ; GFX10-LABEL: fmac_sequence_innermost_fmul:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v2, v2, v3, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %t0 = fmul fast float %a, %b
   %t1 = fmul fast float %c, %d
@@ -292,22 +292,22 @@ define float @fmac_sequence_innermost_fmul(float %a, float %b, float %c, float %
 define float @fmac_sequence_innermost_fmul_swapped_operands(float %a, float %b, float %c, float %d, float %e, float %f, float %g) #0 {
 ; GFX10-LABEL: fmac_sequence_innermost_fmul_swapped_operands:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_f32 v2, v2, v3, v6
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmac_sequence_innermost_fmul_swapped_operands:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f32 v2, v2, v3, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v0, v1
 ; GFX11-NEXT:    v_fmac_f32_e32 v2, v4, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %t0 = fmul fast float %a, %b
   %t1 = fmul fast float %c, %d
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
index ba53c45356c0268..047a60f5449a867 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll
@@ -4,10 +4,10 @@
 define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
 ; CHECK-LABEL: wombat:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %then
@@ -17,7 +17,7 @@ define void @wombat(i1 %cond, ptr addrspace(5) %addr) {
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_byte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load <1 x i8>, ptr addrspace(5) %addr, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index e25a4b301537f75..0b1956cea3f42ad 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -62,26 +62,26 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 define i32 @divergent_vec_0_i16(i16 %a) {
 ; GCN-LABEL: divergent_vec_0_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_0_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_0_i16:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
@@ -147,26 +147,26 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
 define i32 @divergent_vec_i16_0(i16 %a) {
 ; GCN-LABEL: divergent_vec_i16_0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_i16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_i16_0:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_i16_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
@@ -232,26 +232,26 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
 define float @divergent_vec_f16_0(half %a) {
 ; GCN-LABEL: divergent_vec_f16_0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_f16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_f16_0:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_f16_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
@@ -327,30 +327,30 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
 ; GCN-LABEL: divergent_vec_i16_LL:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_i16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_i16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_i16_LL:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
@@ -417,29 +417,29 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
 ; GCN-LABEL: divergent_vec_i16_LH:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_i16_LH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_i16_LH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_i16_LH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
@@ -509,29 +509,29 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
 ; GCN-LABEL: divergent_vec_i16_HH:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_i16_HH:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_i16_HH:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_i16_HH:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
@@ -614,31 +614,31 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 define float @divergent_vec_f16_LL(half %a, half %b) {
 ; GCN-LABEL: divergent_vec_f16_LL:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: divergent_vec_f16_LL:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: divergent_vec_f16_LL:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: divergent_vec_f16_LL:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half %b, i32 1
@@ -649,31 +649,27 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
 define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
 ; GCN-LABEL: build_vec_v2i16_undeflo_divergent:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    ds_read_u16 v0, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: build_vec_v2i16_undeflo_divergent:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16_d16 v0, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: build_vec_v2i16_undeflo_divergent:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16_d16 v0, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index f4d985833eb9adf..d86b53c415256c3 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -15,9 +15,9 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT:    .cfi_sections .debug_frame
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index c744ace37a8315e..bcdba095d541722 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -7,7 +7,6 @@
 define i32 @s_add_co_select_user() {
 ; GFX7-LABEL: s_add_co_select_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -22,11 +21,11 @@ define i32 @s_add_co_select_user() {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: s_add_co_select_user:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -40,11 +39,11 @@ define i32 @s_add_co_select_user() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: s_add_co_select_user:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -57,11 +56,11 @@ define i32 @s_add_co_select_user() {
 ; GFX10-NEXT:    s_cmp_gt_u32 s4, 31
 ; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: s_add_co_select_user:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -75,6 +74,7 @@ define i32 @s_add_co_select_user() {
 ; GFX11-NEXT:    s_cmp_gt_u32 s0, 31
 ; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = load volatile i32, ptr addrspace(4) null, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
index 72ee660dc2adb8b..b91473c9ad30ba8 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-load-i1.ll
@@ -6,7 +6,6 @@
 define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-LABEL: extractloadi1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_ubyte v0, v[0:1]
 ; CHECK-NEXT:    v_and_b32_e32 v1, 7, v2
 ; CHECK-NEXT:    v_lshr_b32_e64 v2, s32, 6
@@ -28,7 +27,7 @@ define i1 @extractloadi1(ptr %ptr, i32 %idx) {
 ; CHECK-NEXT:    buffer_store_byte v3, off, s[0:3], s32 offset:2
 ; CHECK-NEXT:    buffer_store_byte v2, off, s[0:3], s32 offset:1
 ; CHECK-NEXT:    buffer_load_ubyte v0, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %val = load <8 x i1>, ptr %ptr
   %ret = extractelement <8 x i1> %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 9677ec41ce268a5..e2d4203e310fa1e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -19,8 +19,8 @@
 define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_a0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %onebit = shl i32 1, %numlowbits
   %mask = add nsw i32 %onebit, -1
@@ -31,8 +31,8 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_a1_indexzext:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %conv = zext i8 %numlowbits to i32
   %onebit = shl i32 1, %conv
@@ -44,8 +44,8 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_a4_commutative:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %onebit = shl i32 1, %numlowbits
   %mask = add nsw i32 %onebit, -1
@@ -60,8 +60,8 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_b0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = shl i32 -1, %numlowbits
   %mask = xor i32 %notmask, -1
@@ -72,8 +72,8 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_b1_indexzext:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %conv = zext i8 %numlowbits to i32
   %notmask = shl i32 -1, %conv
@@ -85,8 +85,8 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_b4_commutative:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = shl i32 -1, %numlowbits
   %mask = xor i32 %notmask, -1
@@ -101,8 +101,8 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_c0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
@@ -113,18 +113,18 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI-LABEL: bzhi32_c1_indexzext:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
 ; SI-NEXT:    v_lshr_b32_e32 v1, -1, v1
 ; SI-NEXT:    v_and_b32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_c1_indexzext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u16_e32 v1, 32, v1
 ; VI-NEXT:    v_lshrrev_b32_e64 v1, v1, -1
 ; VI-NEXT:    v_and_b32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
@@ -136,8 +136,8 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_c4_commutative:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
@@ -152,8 +152,8 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; GCN-LABEL: bzhi32_d0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i32 32, %numlowbits
   %highbitscleared = shl i32 %val, %numhighbits
@@ -164,18 +164,18 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; SI-LABEL: bzhi32_d1_indexzext:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bzhi32_d1_indexzext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_sub_u16_e32 v1, 32, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 4202edfbd0eb457..40b00403615a1dd 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -6,7 +6,6 @@
 define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xi16_extract_4xi16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB0_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -33,14 +32,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v5, v3
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB0_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB0_3
 ; SI-NEXT:    s_branch .LBB0_4
 ; SI-NEXT:  .LBB0_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB0_4
 ; SI-NEXT:  .LBB0_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -86,11 +92,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
@@ -114,11 +120,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_8xi16_extract_4xi16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
@@ -145,6 +151,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -167,7 +174,6 @@ exit:
 define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xi16_extract_4xi16_2:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB1_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -194,15 +200,22 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB1_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB1_3
 ; SI-NEXT:    s_branch .LBB1_4
 ; SI-NEXT:  .LBB1_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB1_4
 ; SI-NEXT:  .LBB1_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -249,11 +262,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
@@ -277,11 +290,11 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
@@ -308,6 +321,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -330,7 +344,6 @@ exit:
 define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xf16_extract_4xf16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB2_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -360,14 +373,21 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB2_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB2_3
 ; SI-NEXT:    s_branch .LBB2_4
 ; SI-NEXT:  .LBB2_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB2_4
 ; SI-NEXT:  .LBB2_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -412,11 +432,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v3, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_8xf16_extract_4xf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
@@ -443,11 +463,11 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v5
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_8xf16_extract_4xf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
@@ -475,6 +495,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -498,7 +519,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB3_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -541,14 +561,21 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v5, v3
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB3_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB3_3
 ; SI-NEXT:    s_branch .LBB3_4
 ; SI-NEXT:  .LBB3_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB3_4
 ; SI-NEXT:  .LBB3_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -610,11 +637,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
@@ -644,11 +671,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_16xi16_extract_4xi16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
@@ -679,6 +706,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -702,7 +730,6 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ;
 ; SI-LABEL: vec_16xi16_extract_4xi16_2:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB4_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -745,15 +772,22 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
 ; SI-NEXT:    v_or_b32_e32 v2, v7, v2
 ; SI-NEXT:    v_or_b32_e32 v3, v6, v3
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB4_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB4_3
 ; SI-NEXT:    s_branch .LBB4_4
 ; SI-NEXT:  .LBB4_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB4_4
 ; SI-NEXT:  .LBB4_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -816,11 +850,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB4_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
@@ -850,11 +884,11 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB4_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
@@ -885,6 +919,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -908,7 +943,6 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ;
 ; SI-LABEL: vec_16xf16_extract_4xf16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_2
 ; SI-NEXT:  ; %bb.1: ; %F
 ; SI-NEXT:    s_mov_b32 s6, 0
@@ -954,14 +988,21 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB5_3
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB5_3
 ; SI-NEXT:    s_branch .LBB5_4
 ; SI-NEXT:  .LBB5_2:
+; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB5_4
 ; SI-NEXT:  .LBB5_3: ; %T
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1022,11 +1063,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v3, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xf16_extract_4xf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
 ; GFX9-NEXT:  ; %bb.1: ; %F
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
@@ -1059,11 +1100,11 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v4
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_16xf16_extract_4xf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB5_2
 ; GFX11-NEXT:  ; %bb.1: ; %F
 ; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
@@ -1095,6 +1136,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -1117,7 +1159,6 @@ exit:
 define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
 ; SI-LABEL: large_vector:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 5, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
@@ -1128,32 +1169,28 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
 ; SI-NEXT:    ds_read_b32 v2, v1
 ; SI-NEXT:    ds_read_b32 v4, v3
 ; SI-NEXT:    ds_read_b32 v6, v5
-; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT:    s_waitcnt lgkmcnt(2)
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_waitcnt lgkmcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: large_vector:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: large_vector:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
 ; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %idx = shl i32 %idxp, 4
 
@@ -1202,7 +1239,6 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
 define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_16xi16_extract_8xi16_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 1, v4
@@ -1254,10 +1290,14 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v12
 ; SI-NEXT:    v_or_b32_e32 v2, v10, v13
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v14
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB7_3
+; SI-NEXT:    s_mov_b64 s[34:35], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB7_3
 ; SI-NEXT:    s_branch .LBB7_4
 ; SI-NEXT:  .LBB7_2:
+; SI-NEXT:    s_mov_b64 s[34:35], -1
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr2
@@ -1266,7 +1306,10 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB7_4
 ; SI-NEXT:  .LBB7_3: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
@@ -1349,11 +1392,11 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_alignbit_b32 v5, v6, v12, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
@@ -1403,11 +1446,11 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX9-NEXT:    v_perm_b32 v1, v5, v8, s34
 ; GFX9-NEXT:    v_perm_b32 v2, v6, v2, s34
 ; GFX9-NEXT:    v_perm_b32 v3, v7, v3, s34
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_16xi16_extract_8xi16_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_u8 v4, off, s32
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1461,6 +1504,7 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
@@ -1483,7 +1527,6 @@ exit:
 define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_16xf16_extract_8xf16_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 1, v4
@@ -1543,10 +1586,14 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v11
-; SI-NEXT:    s_mov_b64 vcc, exec
-; SI-NEXT:    s_cbranch_execz .LBB8_3
+; SI-NEXT:    s_mov_b64 s[34:35], 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccz .LBB8_3
 ; SI-NEXT:    s_branch .LBB8_4
 ; SI-NEXT:  .LBB8_2:
+; SI-NEXT:    s_mov_b64 s[34:35], -1
 ; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr8
@@ -1555,7 +1602,10 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 vcc, vcc
+; SI-NEXT:    s_cbranch_vccnz .LBB8_4
 ; SI-NEXT:  .LBB8_3: ; %T
 ; SI-NEXT:    s_mov_b32 s39, 0xf000
 ; SI-NEXT:    s_mov_b32 s36, s38
@@ -1643,11 +1693,11 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc
 ; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v8, v9, vcc
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: vec_16xf16_extract_8xf16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
@@ -1695,11 +1745,11 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v9, v5
 ; GFX9-NEXT:    v_pack_b32_f16 v2, v8, v6
 ; GFX9-NEXT:    v_pack_b32_f16 v3, v3, v7
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_16xf16_extract_8xf16_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_u8 v4, off, s32
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1753,6 +1803,7 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v3, v5, v6
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
index d3b5d45470eb7d7..efddeca2470348e 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-equal-length.ll
@@ -7,10 +7,10 @@
 define <3 x i32> @quux() {
 ; CHECK-LABEL: quux:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = shufflevector <4 x i8> <i8 1, i8 2, i8 3, i8 4>, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 2456da621877377..908ca72c5d8a397 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -9,7 +9,6 @@
 define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX7-ALIGNED-NEXT:    flat_load_ushort v2, v[2:3]
@@ -18,34 +17,31 @@ define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-ALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 2
@@ -122,7 +118,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
@@ -143,34 +138,31 @@ define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-ALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_load_2xi16_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 1
@@ -258,37 +250,32 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_load_2xi16_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1
   %p.0 = load i16, ptr addrspace(1) %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
index 08e36413322b647..87475008fb9ac7c 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -12,7 +12,6 @@
 define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
@@ -20,55 +19,49 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-ALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 2
@@ -84,65 +77,61 @@ define i32 @private_load_2xi16_align2(ptr addrspace(5) %p) #0 {
 define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-ALIGNED-LABEL: private_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
 ; GFX7-ALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_store_2xi16_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_store_2xi16_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_store_2xi16_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_store_2xi16_align2:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 2
@@ -154,7 +143,6 @@ define void @private_store_2xi16_align2(ptr addrspace(5) %p, ptr addrspace(5) %r
 define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
@@ -172,55 +160,49 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX7-ALIGNED-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 1
@@ -236,7 +218,6 @@ define i32 @private_load_2xi16_align1(ptr addrspace(5) %p) #0 {
 define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-ALIGNED-LABEL: private_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
 ; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
@@ -247,59 +228,56 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
 ; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v3, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v1, s[0:3], 0 offen
 ; GFX7-ALIGNED-NEXT:    buffer_store_byte v0, v2, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_store_2xi16_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align1:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_store_2xi16_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_store_2xi16_align1:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_store_2xi16_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_store_2xi16_align1:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 1
@@ -311,58 +289,50 @@ define void @private_store_2xi16_align1(ptr addrspace(5) %p, ptr addrspace(5) %r
 define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_load_2xi16_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
   %p.0 = load i16, ptr addrspace(5) %p, align 4
@@ -378,62 +348,58 @@ define i32 @private_load_2xi16_align4(ptr addrspace(5) %p) #0 {
 define void @private_store_2xi16_align4(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
 ; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX7-ALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: private_store_2xi16_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_store_2xi16_align4:
 ; GFX9-FLASTSCR:       ; %bb.0:
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
-; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_store_2xi16_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_store_2xi16_align4:
 ; GFX10-FLASTSCR:       ; %bb.0:
-; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
+; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_store_2xi16_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_store_2xi16_align4:
 ; GFX11-FLASTSCR:       ; %bb.0:
-; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
+; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.r = getelementptr i16, ptr addrspace(5) %r, i64 1
   store i16 1, ptr addrspace(5) %r, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 27c42a9ea0db60a..6b2e2175194078a 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -173,35 +173,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
 ; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ins0 = insertelement <2 x half> undef, half %lo, i32 0
   %ins1 = insertelement <2 x half> %ins0, half %hi, i32 1
@@ -2284,36 +2284,36 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_max_f16_e32 v1, v1, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v3f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v3f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
   ret <3 x half> %canonicalized
@@ -2322,25 +2322,24 @@ define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
 define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v1, v1, v1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2349,13 +2348,14 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
   ret <4 x half> %canonicalized
@@ -2407,32 +2407,32 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
 define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
 ; VI-LABEL: v_test_canonicalize_reg_undef_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_reg_undef_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_reg_undef_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half %val, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2442,31 +2442,31 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
 ; VI-LABEL: v_test_canonicalize_undef_reg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_undef_reg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_undef_reg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half %val, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2476,27 +2476,27 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
 define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfrev_b32_e32 v0, 60
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfrev_b32_e32 v0, 60
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 1.0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfrev_b32_e32 v0, 60
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2506,27 +2506,27 @@ define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
 define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 1.0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 1.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2536,27 +2536,27 @@ define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
 define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfrev_b32_e32 v0, 50
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfrev_b32_e32 v0, 50
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_undef_lo_k_hi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfrev_b32_e32 v0, 50
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2566,27 +2566,27 @@ define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
 define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
 ; VI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4c00
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4c00
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x41800000
 ; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_k_lo_undef_hi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4c00
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <2 x half> undef, half 16.0, i32 0
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
@@ -2596,32 +2596,32 @@ define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
 define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
 ; VI-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, 2.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 2.0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 2.0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, 2.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <2 x half> undef, half %val, i32 0
   %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1
@@ -2632,32 +2632,32 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
 define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
 ; VI-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, 0x4000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, 2.0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 2.0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, 2.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <2 x half> undef, half 2.0, i32 0
   %vec1 = insertelement <2 x half> %vec0, half %val, i32 1
@@ -2716,38 +2716,38 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
 define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
 ; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec = insertelement <4 x half> undef, half %val, i32 0
   %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec)
@@ -2757,25 +2757,24 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
 define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
 ; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
@@ -2784,15 +2783,16 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <4 x half> undef, half %val0, i32 0
   %vec1 = insertelement <4 x half> %vec0, half %val1, i32 1
@@ -2803,27 +2803,26 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
 define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
 ; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v1, v1, v1
 ; VI-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7e000000, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2834,16 +2833,17 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
 ; CI-NEXT:    v_mul_f32_e32 v2, 1.0, v1
 ; CI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, 0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = insertelement <4 x half> undef, half %val0, i32 0
   %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2
@@ -2855,7 +2855,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
 define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v6f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2865,19 +2864,19 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v5
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v4
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v6f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v6f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2890,14 +2889,15 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v6f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <6 x half> @llvm.canonicalize.v6f16(<6 x half> %val)
   ret <6 x half> %canonicalized
@@ -2906,7 +2906,6 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v8f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2919,20 +2918,20 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v6
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v5
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v8f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v8f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2949,15 +2948,16 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v8f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val)
   ret <8 x half> %canonicalized
@@ -2966,7 +2966,6 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v12f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2985,22 +2984,22 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v8
 ; VI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v6
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v12f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX9-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX9-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v12f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3025,17 +3024,18 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v12f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <12 x half> @llvm.canonicalize.v12f16(<12 x half> %val)
   ret <12 x half> %canonicalized
@@ -3044,7 +3044,6 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v16f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3069,11 +3068,11 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v10
 ; VI-NEXT:    v_or_b32_e32 v6, v6, v9
 ; VI-NEXT:    v_or_b32_e32 v7, v7, v8
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v16f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
@@ -3082,11 +3081,11 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; GFX9-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX9-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX9-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v16f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3119,11 +3118,11 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v16f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
@@ -3132,6 +3131,7 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; GFX11-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX11-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX11-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> %val)
   ret <16 x half> %canonicalized
@@ -3140,7 +3140,6 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v32f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v20, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v20
@@ -3189,11 +3188,11 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; VI-NEXT:    v_or_b32_e32 v13, v13, v18
 ; VI-NEXT:    v_or_b32_e32 v14, v14, v17
 ; VI-NEXT:    v_or_b32_e32 v15, v15, v16
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v32f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT:    v_pk_max_f16 v2, v2, v2
@@ -3210,11 +3209,11 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; GFX9-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX9-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX9-NEXT:    v_pk_max_f16 v15, v15, v15
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v32f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -3281,11 +3280,11 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
 ; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v32f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-NEXT:    v_pk_max_f16 v2, v2, v2
@@ -3302,6 +3301,7 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; GFX11-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX11-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX11-NEXT:    v_pk_max_f16 v15, v15, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <32 x half> @llvm.canonicalize.v32f16(<32 x half> %val)
   ret <32 x half> %canonicalized
@@ -3310,7 +3310,6 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; VI-LABEL: v_test_canonicalize_var_v64f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v31
@@ -3409,11 +3408,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; VI-NEXT:    v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v31, v31, v31
 ; VI-NEXT:    v_or_b32_e32 v31, v31, v32
+; VI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_var_v64f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX9-NEXT:    v_pk_max_f16 v1, v1, v1
@@ -3448,11 +3447,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; GFX9-NEXT:    v_pk_max_f16 v30, v30, v30
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v31, v31, v31
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_test_canonicalize_var_v64f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -3709,12 +3708,11 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
 ; CI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; CI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v64f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
@@ -3749,6 +3747,7 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; GFX11-NEXT:    v_pk_max_f16 v30, v30, v30
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v31, v31, v31
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonicalized = call <64 x half> @llvm.canonicalize.v64f16(<64 x half> %val)
   ret <64 x half> %canonicalized
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 41b436473f65211..d2d001f27cf6f32 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2355,22 +2355,22 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v2f32_flush:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v2f32_flush:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg)
   ret <2 x float> %canon
@@ -2380,25 +2380,25 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
 define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v3f32_flush:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v3f32_flush:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg)
   ret <3 x float> %canon
@@ -2408,27 +2408,27 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
 define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v4f32_flush:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX678-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v4f32_flush:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg)
   ret <4 x float> %canon
@@ -2438,7 +2438,6 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
 define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v8f32_flush:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v2, 1.0, v2
@@ -2447,11 +2446,11 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
 ; GFX678-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX678-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX678-NEXT:    v_mul_f32_e32 v7, 1.0, v7
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v8f32_flush:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -2460,15 +2459,16 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
 ; GFX9-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX9-NEXT:    v_max_f32_e32 v6, v6, v6
 ; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GFX11-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
 ; GFX11-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg)
   ret <8 x float> %canon
@@ -2477,23 +2477,23 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
 define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v2f64:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg)
   ret <2 x double> %canon
@@ -2502,26 +2502,26 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
 define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v3f64:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX678-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v3f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v3f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg)
   ret <3 x double> %canon
@@ -2530,29 +2530,29 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
 define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
 ; GFX678-LABEL: v_test_canonicalize_v4f64:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX678-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX678-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX678-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_canonicalize_v4f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_v4f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX11-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX11-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg)
   ret <4 x double> %canon
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 667c561ea26f6f6..d1cce4d0307a5e5 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -645,31 +645,31 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
 define half @v_copysign_f16(half %mag, half %sign) {
 ; SI-LABEL: v_copysign_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_brev_b32 s4, -2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_movk_i32 s4, 0x7fff
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half %sign)
   ret half %result
@@ -678,27 +678,27 @@ define half @v_copysign_f16(half %mag, half %sign) {
 define half @v_test_copysign_f16_0(half %mag) {
 ; SI-LABEL: v_test_copysign_f16_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f16_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_copysign_f16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f16_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half 0.0)
   ret half %result
@@ -707,27 +707,27 @@ define half @v_test_copysign_f16_0(half %mag) {
 define half @v_test_copysign_f16_1(half %mag) {
 ; SI-LABEL: v_test_copysign_f16_1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f16_1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_copysign_f16_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f16_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half 1.0)
   ret half %result
@@ -736,27 +736,27 @@ define half @v_test_copysign_f16_1(half %mag) {
 define half @v_test_copysign_f16_10(half %mag) {
 ; SI-LABEL: v_test_copysign_f16_10:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f16_10:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_copysign_f16_10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f16_10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half 10.0)
   ret half %result
@@ -765,27 +765,27 @@ define half @v_test_copysign_f16_10(half %mag) {
 define half @v_test_copysign_f16_neg1(half %mag) {
 ; SI-LABEL: v_test_copysign_f16_neg1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f16_neg1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_copysign_f16_neg1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f16_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half -1.0)
   ret half %result
@@ -794,27 +794,27 @@ define half @v_test_copysign_f16_neg1(half %mag) {
 define half @v_test_copysign_f16_neg10(half %mag) {
 ; SI-LABEL: v_test_copysign_f16_neg10:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f16_neg10:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_test_copysign_f16_neg10:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f16_neg10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half %mag, half -10.0)
   ret half %result
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index e62dfe100642bca..bbbc7c4ab47f9e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -685,15 +685,15 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
 define float @v_test_copysign_f32(float %mag, float %sign) {
 ; SIVI-LABEL: v_test_copysign_f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
   ret float %result
@@ -702,14 +702,14 @@ define float @v_test_copysign_f32(float %mag, float %sign) {
 define float @v_test_copysign_f32_0(float %mag) {
 ; SIVI-LABEL: v_test_copysign_f32_0:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float 0.0)
   ret float %result
@@ -718,14 +718,14 @@ define float @v_test_copysign_f32_0(float %mag) {
 define float @v_test_copysign_f32_1(float %mag) {
 ; SIVI-LABEL: v_test_copysign_f32_1:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float 1.0)
   ret float %result
@@ -734,14 +734,14 @@ define float @v_test_copysign_f32_1(float %mag) {
 define float @v_test_copysign_f32_10(float %mag) {
 ; SIVI-LABEL: v_test_copysign_f32_10:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32_10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float 10.0)
   ret float %result
@@ -750,14 +750,14 @@ define float @v_test_copysign_f32_10(float %mag) {
 define float @v_test_copysign_f32_neg1(float %mag) {
 ; SIVI-LABEL: v_test_copysign_f32_neg1:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float -1.0)
   ret float %result
@@ -766,14 +766,14 @@ define float @v_test_copysign_f32_neg1(float %mag) {
 define float @v_test_copysign_f32_neg10(float %mag) {
 ; SIVI-LABEL: v_test_copysign_f32_neg10:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f32_neg10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float %mag, float -10.0)
   ret float %result
@@ -782,17 +782,17 @@ define float @v_test_copysign_f32_neg10(float %mag) {
 define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) {
 ; SIVI-LABEL: v_test_copysign_v2f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v1, v3
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v2
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   ret <2 x float> %result
@@ -801,16 +801,16 @@ define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) {
 define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) {
 ; SIVI-LABEL: v_test_copysign_v2f32_0:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v2f32_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> zeroinitializer)
   ret <2 x float> %result
@@ -819,16 +819,16 @@ define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) {
 define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) {
 ; SIVI-LABEL: v_test_copysign_v2f32_neg1:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
 ; SIVI-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v2f32_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> <float -1.0, float -1.0>)
   ret <2 x float> %result
@@ -837,19 +837,19 @@ define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) {
 define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) {
 ; SIVI-LABEL: v_test_copysign_v3f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_bfi_b32 v0, s4, v0, v3
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v1, v4
 ; SIVI-NEXT:    v_bfi_b32 v2, s4, v2, v5
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v3f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v3
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v4
 ; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
   ret <3 x float> %result
@@ -858,21 +858,21 @@ define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) {
 define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) {
 ; SIVI-LABEL: v_test_copysign_v4f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_bfi_b32 v0, s4, v0, v4
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v1, v5
 ; SIVI-NEXT:    v_bfi_b32 v2, s4, v2, v6
 ; SIVI-NEXT:    v_bfi_b32 v3, s4, v3, v7
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v4
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v5
 ; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, v6
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
   ret <4 x float> %result
@@ -881,23 +881,23 @@ define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) {
 define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
 ; SIVI-LABEL: v_test_copysign_v5f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_bfi_b32 v0, s4, v0, v5
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v1, v6
 ; SIVI-NEXT:    v_bfi_b32 v2, s4, v2, v7
 ; SIVI-NEXT:    v_bfi_b32 v3, s4, v3, v8
 ; SIVI-NEXT:    v_bfi_b32 v4, s4, v4, v9
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v5f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v5
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, v6
 ; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, v7
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, v8
 ; GFX11-NEXT:    v_bfi_b32 v4, 0x7fffffff, v4, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <5 x float> @llvm.copysign.v5f32(<5 x float> %mag, <5 x float> %sign)
   ret <5 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 8d97f102799290f..3a3aade585fd969 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -861,17 +861,17 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
 define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
 ; SIVI-LABEL: v_test_copysign_f64:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v11, v21
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v21
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   ret double %result
@@ -880,15 +880,15 @@ define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag
 define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
 ; SIVI-LABEL: v_test_copysign_f64_0:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double 0.0)
   ret double %result
@@ -897,15 +897,15 @@ define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %m
 define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
 ; SIVI-LABEL: v_test_copysign_f64_1:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double 1.0)
   ret double %result
@@ -914,15 +914,15 @@ define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %m
 define double @v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
 ; SIVI-LABEL: v_test_copysign_f64_10:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v11
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double 10.0)
   ret double %result
@@ -931,16 +931,16 @@ define double @v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %
 define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
 ; SIVI-LABEL: v_test_copysign_f64_neg1:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_neg1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double -1.0)
   ret double %result
@@ -949,16 +949,16 @@ define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double
 define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
 ; SIVI-LABEL: v_test_copysign_f64_neg10:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_neg10:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.copysign.f64(double %mag, double -10.0)
   ret double %result
@@ -967,17 +967,17 @@ define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], doubl
 define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
 ; SIVI-LABEL: v_test_copysign_f64_f32:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v10
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v11, v20
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v20
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign.ext = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -987,27 +987,27 @@ define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double
 define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) {
 ; SI-LABEL: v_test_copysign_f64_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_brev_b32 s4, -2
 ; SI-NEXT:    v_mov_b32_e32 v0, v10
 ; SI-NEXT:    v_bfi_b32 v1, s4, v11, v20
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_copysign_f64_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v20
 ; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    v_mov_b32_e32 v0, v10
 ; VI-NEXT:    v_bfi_b32 v1, s4, v11, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_f64_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v11, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign.ext = fpext half %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
@@ -1017,21 +1017,21 @@ define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double
 define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
 ; SIVI-LABEL: v_test_copysign_v2f64:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v2
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v7
 ; SIVI-NEXT:    v_bfi_b32 v3, s4, v5, v9
 ; SIVI-NEXT:    v_mov_b32_e32 v2, v4
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v7
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v9
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   ret <2 x double> %result
@@ -1040,7 +1040,6 @@ define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %
 define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
 ; SIVI-LABEL: v_test_copysign_v3f64:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v2
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v9
@@ -1048,17 +1047,18 @@ define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %
 ; SIVI-NEXT:    v_bfi_b32 v5, s4, v7, v13
 ; SIVI-NEXT:    v_mov_b32_e32 v2, v4
 ; SIVI-NEXT:    v_mov_b32_e32 v4, v6
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v3f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v9
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v11
 ; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, v7, v13
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v6
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
   ret <3 x double> %result
@@ -1067,7 +1067,6 @@ define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %
 define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
 ; SIVI-LABEL: v_test_copysign_v4f64:
 ; SIVI:       ; %bb.0:
-; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_brev_b32 s4, -2
 ; SIVI-NEXT:    v_mov_b32_e32 v0, v2
 ; SIVI-NEXT:    v_bfi_b32 v1, s4, v3, v11
@@ -1077,11 +1076,11 @@ define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; SIVI-NEXT:    v_mov_b32_e32 v2, v4
 ; SIVI-NEXT:    v_mov_b32_e32 v4, v6
 ; SIVI-NEXT:    v_mov_b32_e32 v6, v8
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_copysign_v4f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, v3, v11
 ; GFX11-NEXT:    v_bfi_b32 v3, 0x7fffffff, v5, v13
@@ -1090,6 +1089,7 @@ define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX11-NEXT:    v_mov_b32_e32 v4, v6
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   ret <4 x double> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 415b2a10233fec5..a7324d3e08d4aee 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1649,7 +1649,6 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
 define half @v_fdiv_f16_arcp(half %x, half %y) {
 ; SI-LABEL: v_fdiv_f16_arcp:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1668,35 +1667,36 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; SI-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f16_arcp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_f16_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_arcp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp half %x, %y
   ret half %fdiv
@@ -1705,42 +1705,42 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
 define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
 ; SI-LABEL: v_fdiv_f16_afn_nsz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_rcp_f32_e32 v1, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f16_afn_nsz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fdiv_f16_afn_nsz:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f16_afn_nsz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_afn_nsz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn nsz half %x, %y
   ret half %fdiv
@@ -1832,7 +1832,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ;
 ; SI-LABEL: v_rsq_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -1870,41 +1869,42 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_rsq_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_rsq_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; GFX9-IEEE-LABEL: v_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
@@ -2030,7 +2030,6 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ;
 ; SI-LABEL: v_neg_rsq_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -2068,44 +2067,45 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, -1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_neg_rsq_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rsq_f16_e32 v1, v0
 ; GFX8-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX8-NEXT:    v_xor_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_neg_rsq_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index e9643aa65402bbe..a1436ca83ae143b 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -2413,7 +2413,6 @@ entry:
 define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2428,11 +2427,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2447,11 +2446,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2466,11 +2465,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2485,11 +2484,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2504,11 +2503,11 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_denorm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -2524,6 +2523,7 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_denorm:
@@ -2537,7 +2537,6 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
 define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -2549,11 +2548,11 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2565,11 +2564,11 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -2581,11 +2580,11 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2597,11 +2596,11 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -2613,11 +2612,11 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -2630,6 +2629,7 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee:
@@ -2643,7 +2643,6 @@ define float @v_fdiv_f32_ieee(float %x, float %y) #1 {
 define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -2657,11 +2656,11 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2670,11 +2669,11 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2683,11 +2682,11 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2696,11 +2695,11 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2710,6 +2709,7 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp:
@@ -2723,7 +2723,6 @@ define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 {
 define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2738,11 +2737,11 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2757,11 +2756,11 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2776,11 +2775,11 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2795,11 +2794,11 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -2814,11 +2813,11 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -2834,6 +2833,7 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic:
@@ -2847,7 +2847,6 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
 define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -2861,11 +2860,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2874,11 +2873,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2887,11 +2886,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2900,11 +2899,11 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -2914,6 +2913,7 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp:
@@ -2927,7 +2927,6 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 {
 define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2941,11 +2940,11 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2959,11 +2958,11 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -2977,11 +2976,11 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -2995,11 +2994,11 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -3013,11 +3012,11 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -3032,6 +3031,7 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz:
@@ -3045,7 +3045,6 @@ define float @v_fdiv_f32_daz(float %x, float %y) #0 {
 define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -3054,22 +3053,22 @@ define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
@@ -3077,6 +3076,7 @@ define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp:
@@ -3091,7 +3091,6 @@ define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 {
 define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -3104,11 +3103,11 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v3
@@ -3121,11 +3120,11 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -3138,11 +3137,11 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v3
@@ -3155,11 +3154,11 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX10-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -3172,11 +3171,11 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -3190,6 +3189,7 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_contractable_user:
@@ -3204,7 +3204,6 @@ define float @v_fdiv_f32_ieee_contractable_user(float %x, float %y, float %z) #1
 define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float %z) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -3219,11 +3218,11 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3233,11 +3232,11 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3247,11 +3246,11 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3261,11 +3260,11 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3276,6 +3275,7 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX11-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user:
@@ -3290,7 +3290,6 @@ define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float
 define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -3306,11 +3305,11 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3326,11 +3325,11 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -3346,11 +3345,11 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3366,11 +3365,11 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3386,11 +3385,11 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -3407,6 +3406,7 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_contractable_user:
@@ -3421,7 +3421,6 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
 define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, float %z) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -3436,11 +3435,11 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3450,11 +3449,11 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3464,11 +3463,11 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3478,11 +3477,11 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -3493,6 +3492,7 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX11-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user:
@@ -3507,7 +3507,6 @@ define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, flo
 define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -3522,11 +3521,11 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v3
@@ -3541,11 +3540,11 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -3560,11 +3559,11 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v3
@@ -3579,11 +3578,11 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3598,11 +3597,11 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
@@ -3618,6 +3617,7 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_contractable_user:
@@ -3632,7 +3632,6 @@ define float @v_fdiv_f32_daz_contractable_user(float %x, float %y, float %z) #0
 define float @v_fdiv_f32_daz_25ulp_contractable_user(float %x, float %y, float %z) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp_contractable_user:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v3, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -3641,22 +3640,22 @@ define float @v_fdiv_f32_daz_25ulp_contractable_user(float %x, float %y, float %
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mad_f32 v0, v3, v0, v2
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mad_f32 v0, v3, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
@@ -3664,6 +3663,7 @@ define float @v_fdiv_f32_daz_25ulp_contractable_user(float %x, float %y, float %
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_fma_f32 v0, v3, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp_contractable_user:
@@ -3678,7 +3678,6 @@ define float @v_fdiv_f32_daz_25ulp_contractable_user(float %x, float %y, float %
 define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -3690,11 +3689,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -3706,11 +3705,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -3722,11 +3721,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -3738,11 +3737,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -3754,11 +3753,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -3771,6 +3770,7 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee__nnan_ninf:
@@ -3784,7 +3784,6 @@ define float @v_fdiv_f32_ieee__nnan_ninf(float %x, float %y, float %z) #1 {
 define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -3798,11 +3797,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3811,11 +3810,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -3824,11 +3823,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -3837,11 +3836,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -3851,6 +3850,7 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf:
@@ -3864,7 +3864,6 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1
 define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -3879,11 +3878,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3898,11 +3897,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -3917,11 +3916,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3936,11 +3935,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -3955,11 +3954,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -3975,6 +3974,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf:
@@ -3988,7 +3988,6 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
 define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -4002,11 +4001,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4015,11 +4014,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4028,11 +4027,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4041,11 +4040,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -4055,6 +4054,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
@@ -4068,7 +4068,6 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -4082,11 +4081,11 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -4100,11 +4099,11 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -4118,11 +4117,11 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -4136,11 +4135,11 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -4154,11 +4153,11 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -4173,6 +4172,7 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz__nnan_ninf:
@@ -4186,7 +4186,6 @@ define float @v_fdiv_f32_daz__nnan_ninf(float %x, float %y, float %z) #0 {
 define float @v_fdiv_f32_daz_25ulp__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -4195,22 +4194,22 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
@@ -4218,6 +4217,7 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf(float %x, float %y, float %z) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf:
@@ -4231,7 +4231,6 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf(float %x, float %y, float %z) #0 {
 define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, float %z) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -4244,11 +4243,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v3
@@ -4261,11 +4260,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -4278,11 +4277,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v3
@@ -4295,11 +4294,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX10-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -4312,11 +4311,11 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -4330,6 +4329,7 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee__nnan_ninf_contractable_user:
@@ -4344,7 +4344,6 @@ define float @v_fdiv_f32_ieee__nnan_ninf_contractable_user(float %x, float %y, f
 define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -4359,11 +4358,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4373,11 +4372,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4387,11 +4386,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -4401,11 +4400,11 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -4416,6 +4415,7 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX11-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user:
@@ -4430,7 +4430,6 @@ define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float
 define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y, float %z) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -4446,11 +4445,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4466,11 +4465,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -4486,11 +4485,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4506,11 +4505,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -4526,11 +4525,11 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -4547,6 +4546,7 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user:
@@ -4561,7 +4561,6 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
 define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -4576,11 +4575,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4590,11 +4589,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4604,11 +4603,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -4618,11 +4617,11 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX10-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
@@ -4633,6 +4632,7 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX11-NEXT:    v_ldexp_f32 v0, v3, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
@@ -4647,7 +4647,6 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, float %z) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -4662,11 +4661,11 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v5, v3
@@ -4681,11 +4680,11 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX7-NEXT:    v_div_scale_f32 v5, vcc, v0, v1, v0
@@ -4700,11 +4699,11 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX7-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX7-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v5, v3
@@ -4719,11 +4718,11 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX8-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v3, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
@@ -4738,11 +4737,11 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX10-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v3
@@ -4758,6 +4757,7 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 ; GFX11-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v1, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz__nnan_ninf_contractable_user:
@@ -4772,7 +4772,6 @@ define float @v_fdiv_f32_daz__nnan_ninf_contractable_user(float %x, float %y, fl
 define float @v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v3, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -4781,22 +4780,22 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mad_f32 v0, v3, v0, v2
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mad_f32 v0, v3, v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
@@ -4804,6 +4803,7 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user(float %x, float
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_fma_f32 v0, v3, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user:
@@ -4818,7 +4818,6 @@ define float @v_fdiv_f32_daz_25ulp__nnan_ninf_contractable_user(float %x, float
 define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -4830,11 +4829,11 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -4846,11 +4845,11 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -4862,11 +4861,11 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -4878,11 +4877,11 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, -v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -4894,11 +4893,11 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, -v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -4911,6 +4910,7 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_ieee:
@@ -4925,7 +4925,6 @@ define float @v_fdiv_neglhs_f32_ieee(float %x, float %y) #1 {
 define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -4939,11 +4938,11 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4952,11 +4951,11 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4965,11 +4964,11 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -4978,11 +4977,11 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -4992,6 +4991,7 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_ieee_25ulp:
@@ -5006,7 +5006,6 @@ define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 {
 define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
@@ -5021,11 +5020,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5040,11 +5039,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
@@ -5059,11 +5058,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5078,11 +5077,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, -v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5097,11 +5096,11 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_dynamic:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, -v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -5117,6 +5116,7 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_dynamic:
@@ -5131,7 +5131,6 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
 define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -5145,11 +5144,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5158,11 +5157,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5171,11 +5170,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -5184,11 +5183,11 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e64 v3, -v0
@@ -5198,6 +5197,7 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
@@ -5212,7 +5212,6 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
@@ -5226,11 +5225,11 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5244,11 +5243,11 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, -v0, v1, -v0
@@ -5262,11 +5261,11 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, -v0, v1, -v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5280,11 +5279,11 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, -v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -5298,11 +5297,11 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, -v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -5317,6 +5316,7 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_daz:
@@ -5331,7 +5331,6 @@ define float @v_fdiv_neglhs_f32_daz(float %x, float %y) #0 {
 define float @v_fdiv_neglhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-LABEL: v_fdiv_neglhs_f32_daz_25ulp:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -5340,22 +5339,22 @@ define float @v_fdiv_neglhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_neglhs_f32_daz_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_neglhs_f32_daz_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
@@ -5363,6 +5362,7 @@ define float @v_fdiv_neglhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_neglhs_f32_daz_25ulp:
@@ -5377,7 +5377,6 @@ define float @v_fdiv_neglhs_f32_daz_25ulp(float %x, float %y) #0 {
 define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -5389,11 +5388,11 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5405,11 +5404,11 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -5421,11 +5420,11 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5437,11 +5436,11 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, -v1, -v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -5453,11 +5452,11 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, -v1, -v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -5470,6 +5469,7 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_ieee:
@@ -5484,7 +5484,6 @@ define float @v_fdiv_negrhs_f32_ieee(float %x, float %y) #1 {
 define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
@@ -5498,11 +5497,11 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5511,11 +5510,11 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5524,11 +5523,11 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5537,11 +5536,11 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5551,6 +5550,7 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_ieee_25ulp:
@@ -5565,7 +5565,6 @@ define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 {
 define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
@@ -5580,11 +5579,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5599,11 +5598,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
@@ -5618,11 +5617,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5637,11 +5636,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, -v1, -v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, -v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -5656,11 +5655,11 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_dynamic:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, -v1, -v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, -v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -5676,6 +5675,7 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_dynamic:
@@ -5690,7 +5690,6 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
 define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s6, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
@@ -5704,11 +5703,11 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5717,11 +5716,11 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -5730,11 +5729,11 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5743,11 +5742,11 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e64 v2, -v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -5757,6 +5756,7 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp:
@@ -5771,7 +5771,6 @@ define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 {
 define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
@@ -5785,11 +5784,11 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5803,11 +5802,11 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, -v1, v0
@@ -5821,11 +5820,11 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, -v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -5839,11 +5838,11 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, -v1, -v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, -v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -5857,11 +5856,11 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, -v1, -v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, -v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -5876,6 +5875,7 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, -v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_daz:
@@ -5890,7 +5890,6 @@ define float @v_fdiv_negrhs_f32_daz(float %x, float %y) #0 {
 define float @v_fdiv_negrhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-LABEL: v_fdiv_negrhs_f32_daz_25ulp:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -5899,22 +5898,22 @@ define float @v_fdiv_negrhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_negrhs_f32_daz_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e64 v1, -v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_negrhs_f32_daz_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e64 v1, -v1, v2
@@ -5922,6 +5921,7 @@ define float @v_fdiv_negrhs_f32_daz_25ulp(float %x, float %y) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_negrhs_f32_daz_25ulp:
@@ -5936,7 +5936,6 @@ define float @v_fdiv_negrhs_f32_daz_25ulp(float %x, float %y) #0 {
 define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -5949,11 +5948,11 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -5966,11 +5965,11 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -5983,11 +5982,11 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -6000,11 +5999,11 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
@@ -6016,11 +6015,11 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -6033,6 +6032,7 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 ; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_ieee:
@@ -6046,7 +6046,6 @@ define float @v_fdiv_f32_constrhs0_ieee(float %x) #1 {
 define float @v_fdiv_f32_constrhs0_ieee_25ulp(float %x) #1 {
 ; GFX6-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -6056,45 +6055,46 @@ define float @v_fdiv_f32_constrhs0_ieee_25ulp(float %x) #1 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x3fa9e0f0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fa9e0f0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, -14, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fa9e0f0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -14, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fa9e0f0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, -14, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fa9e0f0, v1 :: v_dual_add_nc_u32 v0, -14, v0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp:
@@ -6108,7 +6108,6 @@ define float @v_fdiv_f32_constrhs0_ieee_25ulp(float %x) #1 {
 define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6124,11 +6123,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -6144,11 +6143,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6164,11 +6163,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -6184,11 +6183,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0
 ; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -6203,11 +6202,11 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
 ; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -6223,6 +6222,7 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic:
@@ -6236,7 +6236,6 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
 define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #2 {
 ; GFX6-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -6246,45 +6245,46 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #2 {
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x3fa9e0f0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 0x3fa9e0f0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, -14, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x3fa9e0f0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -14, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3fa9e0f0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, -14, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_dual_mul_f32 v1, 0x3fa9e0f0, v1 :: v_dual_add_nc_u32 v0, -14, v0
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp:
@@ -6298,7 +6298,6 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #2 {
 define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6313,11 +6312,11 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -6332,11 +6331,11 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6351,11 +6350,11 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
@@ -6370,11 +6369,11 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0
 ; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6388,11 +6387,11 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constrhs0_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0
 ; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6407,6 +6406,7 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, 0x4640e400, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_daz:
@@ -6420,8 +6420,8 @@ define float @v_fdiv_f32_constrhs0_daz(float %x) #0 {
 define float @v_fdiv_f32_constrhs0_daz_25ulp(float %x) #0 {
 ; GCN-LABEL: v_fdiv_f32_constrhs0_daz_25ulp:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x38a9e0f0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constrhs0_daz_25ulp:
@@ -6435,7 +6435,6 @@ define float @v_fdiv_f32_constrhs0_daz_25ulp(float %x) #0 {
 define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6448,11 +6447,11 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6465,11 +6464,11 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6482,11 +6481,11 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6499,11 +6498,11 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x4640e400
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX10-NEXT:    v_fma_f32 v3, -v1, v2, 1.0
@@ -6515,11 +6514,11 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x4640e400
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -6532,6 +6531,7 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 ; GFX11-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_ieee:
@@ -6545,7 +6545,6 @@ define float @v_fdiv_f32_constlhs0_ieee(float %x) #1 {
 define float @v_fdiv_f32_constlhs0_ieee_25ulp(float %x) #1 {
 ; GFX6-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -6556,44 +6555,44 @@ define float @v_fdiv_f32_constlhs0_ieee_25ulp(float %x) #1 {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, 14, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 14, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 14, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
@@ -6601,6 +6600,7 @@ define float @v_fdiv_f32_constlhs0_ieee_25ulp(float %x) #1 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp:
@@ -6614,7 +6614,6 @@ define float @v_fdiv_f32_constlhs0_ieee_25ulp(float %x) #1 {
 define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6630,11 +6629,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6650,11 +6649,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6670,11 +6669,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6690,11 +6689,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x4640e400
 ; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -6709,11 +6708,11 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x4640e400
 ; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -6729,6 +6728,7 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic:
@@ -6742,7 +6742,6 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
 define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #2 {
 ; GFX6-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -6753,44 +6752,44 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #2 {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, 14, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, 14, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 14, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX10-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
@@ -6798,6 +6797,7 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #2 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v1, 0x3f40e400, v1
 ; GFX11-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp:
@@ -6811,7 +6811,6 @@ define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #2 {
 define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6826,11 +6825,11 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6845,11 +6844,11 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6864,11 +6863,11 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s6, 0x4640e400
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
@@ -6883,11 +6882,11 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x4640e400
 ; GFX10-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6901,11 +6900,11 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 0x4640e400
 ; GFX11-NEXT:    v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
@@ -6920,6 +6919,7 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x4640e400
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_daz:
@@ -6933,7 +6933,6 @@ define float @v_fdiv_f32_constlhs0_daz(float %x) #0 {
 define float @v_fdiv_f32_constlhs0_daz_25ulp(float %x) #0 {
 ; GFX678-LABEL: v_fdiv_f32_constlhs0_daz_25ulp:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v1, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -6942,22 +6941,22 @@ define float @v_fdiv_f32_constlhs0_daz_25ulp(float %x) #0 {
 ; GFX678-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v0, 0x4640e400, v0
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_constlhs0_daz_25ulp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v0|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4640e400, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_constlhs0_daz_25ulp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v0|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -6965,6 +6964,7 @@ define float @v_fdiv_f32_constlhs0_daz_25ulp(float %x) #0 {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4640e400, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_constlhs0_daz_25ulp:
@@ -6978,7 +6978,6 @@ define float @v_fdiv_f32_constlhs0_daz_25ulp(float %x) #0 {
 define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -6990,11 +6989,11 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7006,11 +7005,11 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -7022,11 +7021,11 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7038,11 +7037,11 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -7054,11 +7053,11 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -7071,6 +7070,7 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_nodenorm_x:
@@ -7084,7 +7084,6 @@ define float @v_fdiv_f32_ieee_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -7098,11 +7097,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7111,11 +7110,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7124,11 +7123,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7137,11 +7136,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7151,6 +7150,7 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x:
@@ -7164,7 +7164,6 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y
 define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7179,11 +7178,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7198,11 +7197,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7217,11 +7216,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7236,11 +7235,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7255,11 +7254,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -7275,6 +7274,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_x:
@@ -7288,7 +7288,6 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
 define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -7302,11 +7301,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7315,11 +7314,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7328,11 +7327,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7341,11 +7340,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7355,6 +7354,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
@@ -7368,7 +7368,6 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7382,11 +7381,11 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7400,11 +7399,11 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7418,11 +7417,11 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7436,11 +7435,11 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -7454,11 +7453,11 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -7473,6 +7472,7 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_nodenorm_x:
@@ -7486,7 +7486,6 @@ define float @v_fdiv_f32_daz_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 define float @v_fdiv_f32_daz_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_x:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -7495,22 +7494,22 @@ define float @v_fdiv_f32_daz_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y)
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_x:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_x:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
@@ -7518,6 +7517,7 @@ define float @v_fdiv_f32_daz_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y)
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_x:
@@ -7531,7 +7531,6 @@ define float @v_fdiv_f32_daz_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y)
 define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -7543,11 +7542,11 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX6-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7559,11 +7558,11 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX6-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -7575,11 +7574,11 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX7-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7591,11 +7590,11 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX8-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX10-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -7607,11 +7606,11 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX10-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -7624,6 +7623,7 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX11-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_nodenorm_y:
@@ -7637,7 +7637,6 @@ define float @v_fdiv_f32_ieee_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y) #1 {
 ; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -7651,11 +7650,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7664,11 +7663,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7677,11 +7676,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7690,11 +7689,11 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7704,6 +7703,7 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y:
@@ -7717,7 +7717,6 @@ define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y
 define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #2 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7732,11 +7731,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7751,11 +7750,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX6-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7770,11 +7769,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX7-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7789,11 +7788,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX8-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
@@ -7808,11 +7807,11 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
@@ -7828,6 +7827,7 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 ; GFX11-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_nodenorm_y:
@@ -7841,7 +7841,6 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
 define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y) #2 {
 ; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
@@ -7855,11 +7854,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX7-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7868,11 +7867,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -7881,11 +7880,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
 ; GFX8-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7894,11 +7893,11 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v0
@@ -7908,6 +7907,7 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
@@ -7921,7 +7921,6 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX6-FASTFMA:       ; %bb.0:
-; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX6-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7935,11 +7934,11 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX6-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX6-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX6-SLOWFMA:       ; %bb.0:
-; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7953,11 +7952,11 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX6-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX6-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX6-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX7-NEXT:    v_rcp_f32_e32 v3, v2
 ; GFX7-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -7971,11 +7970,11 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX7-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX7-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; GFX8-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX8-NEXT:    v_rcp_f32_e32 v4, v2
@@ -7989,11 +7988,11 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX8-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_scale_f32 v2, s4, v1, v1, v0
 ; GFX10-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX10-NEXT:    v_rcp_f32_e32 v3, v2
@@ -8007,11 +8006,11 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX10-NEXT:    s_denorm_mode 12
 ; GFX10-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v2, null, v1, v1, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, vcc_lo, v0, v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v2
@@ -8026,6 +8025,7 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX11-NEXT:    s_denorm_mode 12
 ; GFX11-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_nodenorm_y:
@@ -8039,7 +8039,6 @@ define float @v_fdiv_f32_daz_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 define float @v_fdiv_f32_daz_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y) #0 {
 ; GFX678-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_y:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_mov_b32 s4, 0x6f800000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; GFX678-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -8048,22 +8047,22 @@ define float @v_fdiv_f32_daz_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y)
 ; GFX678-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX678-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_y:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_y:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_lt_f32_e64 s0, 0x6f800000, |v1|
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
@@ -8071,6 +8070,7 @@ define float @v_fdiv_f32_daz_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y)
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_fdiv_f32_daz_25ulp_nodenorm_y:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 794b10eea58b9bb..ff2b852ddb9ef0c 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -15,7 +15,6 @@
 define float @v_fdiv_f32(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
 ; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -27,11 +26,11 @@ define float @v_fdiv_f32(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -43,11 +42,11 @@ define float @v_fdiv_f32(float %x, float %y) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_f32:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -59,11 +58,11 @@ define float @v_fdiv_f32(float %x, float %y) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_f32:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -75,11 +74,11 @@ define float @v_fdiv_f32(float %x, float %y) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; DAZ-LABEL: v_fdiv_f32:
 ; DAZ:       ; %bb.0:
-; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; DAZ-NEXT:    v_rcp_f32_e32 v3, v2
 ; DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -93,6 +92,7 @@ define float @v_fdiv_f32(float %x, float %y) {
 ; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float %x, %y
   ret float %fdiv
@@ -101,9 +101,9 @@ define float @v_fdiv_f32(float %x, float %y) {
 define float @v_fdiv_f32_afn(float %x, float %y) {
 ; CHECK-LABEL: v_fdiv_f32_afn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float %x, %y
   ret float %fdiv
@@ -112,7 +112,6 @@ define float @v_fdiv_f32_afn(float %x, float %y) {
 define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_arcp:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
 ; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -124,11 +123,11 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_arcp:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -140,11 +139,11 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_f32_arcp:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -156,11 +155,11 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_f32_arcp:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -172,11 +171,11 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; DAZ-LABEL: v_fdiv_f32_arcp:
 ; DAZ:       ; %bb.0:
-; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; DAZ-NEXT:    v_rcp_f32_e32 v3, v2
 ; DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -190,6 +189,7 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 ; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float %x, %y
   ret float %fdiv
@@ -198,9 +198,9 @@ define float @v_fdiv_f32_arcp(float %x, float %y) {
 define float @v_fdiv_f32_arcp_afn(float %x, float %y) {
 ; CHECK-LABEL: v_fdiv_f32_arcp_afn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn float %x, %y
   ret float %fdiv
@@ -209,7 +209,6 @@ define float @v_fdiv_f32_arcp_afn(float %x, float %y) {
 define float @v_fdiv_recip_f32(float %x) {
 ; IEEE-LABEL: v_fdiv_recip_f32:
 ; IEEE:       ; %bb.0:
-; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IEEE-NEXT:    v_rcp_f32_e32 v2, v1
 ; IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -221,11 +220,11 @@ define float @v_fdiv_recip_f32(float %x) {
 ; IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; DAZ-LABEL: v_fdiv_recip_f32:
 ; DAZ:       ; %bb.0:
-; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; DAZ-NEXT:    v_rcp_f32_e32 v2, v1
 ; DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -239,6 +238,7 @@ define float @v_fdiv_recip_f32(float %x) {
 ; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float 1.0, %x
   ret float %fdiv
@@ -247,8 +247,8 @@ define float @v_fdiv_recip_f32(float %x) {
 define float @v_fdiv_recip_f32_afn(float %x) {
 ; CHECK-LABEL: v_fdiv_recip_f32_afn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float 1.0, %x
   ret float %fdiv
@@ -257,7 +257,6 @@ define float @v_fdiv_recip_f32_afn(float %x) {
 define float @v_fdiv_recip_f32_arcp(float %x) {
 ; IEEE-LABEL: v_fdiv_recip_f32_arcp:
 ; IEEE:       ; %bb.0:
-; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IEEE-NEXT:    v_rcp_f32_e32 v2, v1
 ; IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -269,11 +268,11 @@ define float @v_fdiv_recip_f32_arcp(float %x) {
 ; IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; DAZ-LABEL: v_fdiv_recip_f32_arcp:
 ; DAZ:       ; %bb.0:
-; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; DAZ-NEXT:    v_rcp_f32_e32 v2, v1
 ; DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -287,6 +286,7 @@ define float @v_fdiv_recip_f32_arcp(float %x) {
 ; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp float 1.0, %x
   ret float %fdiv
@@ -295,8 +295,8 @@ define float @v_fdiv_recip_f32_arcp(float %x) {
 define float @v_fdiv_recip_f32_arcp_afn(float %x) {
 ; CHECK-LABEL: v_fdiv_recip_f32_arcp_afn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv arcp afn float 1.0, %x
   ret float %fdiv
@@ -305,7 +305,6 @@ define float @v_fdiv_recip_f32_arcp_afn(float %x) {
 define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -335,11 +334,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -369,11 +368,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -403,11 +402,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -437,11 +436,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -472,11 +471,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-DAZ-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -507,11 +506,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -542,11 +541,11 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; IR-DAZ-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -577,6 +576,7 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 ; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv float 1.0, %sqrt
@@ -586,9 +586,9 @@ define float @v_fdiv_recip_sqrt_f32(float %x) {
 define float @v_fdiv_recip_sqrt_f32_afn(float %x) {
 ; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call afn float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv afn float 1.0, %sqrt
@@ -598,13 +598,12 @@ define float @v_fdiv_recip_sqrt_f32_afn(float %x) {
 define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -634,17 +633,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -674,17 +673,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -715,17 +714,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -756,6 +755,7 @@ define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 ; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call arcp float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv arcp float 1.0, %sqrt
@@ -765,54 +765,54 @@ define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
 define float @v_fdiv_recip_sqrt_f32_arcp_afn(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call arcp afn float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv arcp afn float 1.0, %sqrt
@@ -822,13 +822,12 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn(float %x) {
 define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -858,17 +857,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -898,17 +897,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -939,17 +938,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; CODEGEN-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -980,6 +979,7 @@ define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 ; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv arcp float 1.0, %sqrt
@@ -989,7 +989,6 @@ define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
 define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1009,11 +1008,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1033,11 +1032,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1057,11 +1056,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; IR-IEEE-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1081,11 +1080,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1104,11 +1103,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1127,11 +1126,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1150,11 +1149,11 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; IR-DAZ-SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1173,6 +1172,7 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 ; IR-DAZ-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-DAZ-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv afn float 1.0, %sqrt
@@ -1182,13 +1182,12 @@ define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
 define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1208,17 +1207,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1238,17 +1237,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1267,17 +1266,17 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1296,6 +1295,7 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 ; IR-DAZ-GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; IR-DAZ-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; IR-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %x)
   %fdiv = fdiv arcp afn float 1.0, %sqrt
@@ -1305,7 +1305,6 @@ define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
 define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v2, v2
 ; CODEGEN-IEEE-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -1314,11 +1313,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; CODEGEN-IEEE-SDAG-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; CODEGEN-IEEE-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -1327,11 +1326,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
@@ -1343,11 +1342,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -1359,11 +1358,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_f32_ulp25:
 ; CODEGEN-DAZ-SDAG:       ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0x6f800000
 ; CODEGEN-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x2f800000
 ; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
@@ -1372,11 +1371,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; CODEGEN-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v1, v1
 ; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_f32_ulp25:
 ; CODEGEN-DAZ-GISEL:       ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v2, 0x6f800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v3, 0x2f800000
 ; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, v2
@@ -1385,11 +1384,11 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v1, v1
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-LABEL: v_fdiv_f32_ulp25:
 ; IR-DAZ:       ; %bb.0:
-; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
 ; IR-DAZ-NEXT:    v_rcp_f32_e32 v3, v2
 ; IR-DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
@@ -1403,6 +1402,7 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 ; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
 ; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float %x, %y, !fpmath !0
   ret float %fdiv
@@ -1411,9 +1411,9 @@ define float @v_fdiv_f32_ulp25(float %x, float %y) {
 define float @v_fdiv_f32_afn_ulp25(float %x, float %y) {
 ; CHECK-LABEL: v_fdiv_f32_afn_ulp25:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float %x, %y, !fpmath !0
   ret float %fdiv
@@ -1422,27 +1422,26 @@ define float @v_fdiv_f32_afn_ulp25(float %x, float %y) {
 define float @v_recip_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_recip_f32_ulp25:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v1, v1
 ; CODEGEN-IEEE-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_recip_f32_ulp25:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v1, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-LABEL: v_recip_f32_ulp25:
 ; IR-IEEE:       ; %bb.0:
-; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
 ; IR-IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1454,17 +1453,17 @@ define float @v_recip_f32_ulp25(float %x) {
 ; IR-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-LABEL: v_recip_f32_ulp25:
 ; CODEGEN-DAZ:       ; %bb.0:
-; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-LABEL: v_recip_f32_ulp25:
 ; IR-DAZ:       ; %bb.0:
-; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-DAZ-NEXT:    v_rcp_f32_e32 v2, v1
 ; IR-DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1478,6 +1477,7 @@ define float @v_recip_f32_ulp25(float %x) {
 ; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv float 1.0, %x, !fpmath !0
   ret float %fdiv
@@ -1486,8 +1486,8 @@ define float @v_recip_f32_ulp25(float %x) {
 define float @v_recip_f32_afn_ulp25(float %x) {
 ; CHECK-LABEL: v_recip_f32_afn_ulp25:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn float 1.0, %x, !fpmath !0
   ret float %fdiv
@@ -1496,7 +1496,6 @@ define float @v_recip_f32_afn_ulp25(float %x) {
 define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1510,11 +1509,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CODEGEN-IEEE-SDAG-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -1527,11 +1526,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CODEGEN-IEEE-GISEL-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; IR-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; IR-IEEE-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1551,11 +1550,11 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; IR-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; IR-IEEE-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -1574,18 +1573,18 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25:
 ; CODEGEN-DAZ:       ; %bb.0:
-; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CODEGEN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25:
 ; IR-DAZ:       ; %bb.0:
-; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; IR-DAZ-NEXT:    v_rcp_f32_e32 v2, v1
@@ -1600,6 +1599,7 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 ; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %x), !fpmath !0
   %fdiv = fdiv float 1.0, %sqrt, !fpmath !0
@@ -1609,7 +1609,6 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
 define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4b800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1619,11 +1618,11 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1633,11 +1632,11 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; IR-IEEE-SDAG:       ; %bb.0:
-; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1667,11 +1666,11 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; IR-IEEE-SDAG-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; IR-IEEE-GISEL:       ; %bb.0:
-; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1701,17 +1700,17 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; CODEGEN-DAZ:       ; %bb.0:
-; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; IR-DAZ-SDAG:       ; %bb.0:
-; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0xf800000
 ; IR-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; IR-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1742,11 +1741,11 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; IR-DAZ-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-SDAG-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-SDAG-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
 ; IR-DAZ-GISEL:       ; %bb.0:
-; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; IR-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1777,6 +1776,7 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 ; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0
   %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0
@@ -1786,9 +1786,9 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
 define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
 ; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sqrt_f32_e32 v0, v0
 ; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call afn float @llvm.sqrt.f32(float %x), !fpmath !0
   %fdiv = fdiv afn float 1.0, %sqrt, !fpmath !0
@@ -1798,34 +1798,34 @@ define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
 define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) {
 ; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
 ; CODEGEN-IEEE-SDAG:       ; %bb.0:
-; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
 ; CODEGEN-IEEE-GISEL:       ; %bb.0:
-; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-IEEE-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
 ; IR-IEEE:       ; %bb.0:
-; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-IEEE-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
 ; CODEGEN-DAZ:       ; %bb.0:
-; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; IR-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
 ; IR-DAZ:       ; %bb.0:
-; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; IR-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0
   %fdiv = fdiv contract afn float 1.0, %sqrt, !fpmath !0
diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
index f440b1f9ee6f7bb..6768161c8308048 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll
@@ -11,9 +11,9 @@
 define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 {
 ; GCN-LABEL: test_stack_realign:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 07b3df2a8520aae..9fea3cf58cf4418 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -173,7 +173,6 @@ define amdgpu_kernel void @zero_init_kernel() {
 define void @zero_init_foo() {
 ; GFX9-LABEL: zero_init_foo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
 ; GFX9-NEXT:    s_mov_b32 s1, s0
 ; GFX9-NEXT:    s_mov_b32 s2, s0
@@ -186,12 +185,11 @@ define void @zero_init_foo() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zero_init_foo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
@@ -204,11 +202,11 @@ define void @zero_init_foo() {
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zero_init_foo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s1, s0
@@ -221,11 +219,11 @@ define void @zero_init_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: zero_init_foo:
 ; GFX9-PAL:       ; %bb.0:
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
@@ -238,12 +236,11 @@ define void @zero_init_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: zero_init_foo:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_mov_b32 s0, 0
 ; GFX940-NEXT:    s_mov_b32 s1, s0
 ; GFX940-NEXT:    s_mov_b32 s2, s0
@@ -254,12 +251,11 @@ define void @zero_init_foo() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: zero_init_foo:
 ; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
@@ -272,11 +268,11 @@ define void @zero_init_foo() {
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: zero_init_foo:
 ; GFX11-PAL:       ; %bb.0:
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
@@ -289,6 +285,7 @@ define void @zero_init_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
@@ -693,7 +690,6 @@ bb:
 define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
@@ -702,12 +698,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
@@ -715,12 +710,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -728,12 +722,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
@@ -742,12 +735,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s32
 ; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
@@ -756,12 +748,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_foo:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
@@ -769,12 +760,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -782,7 +772,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -797,59 +787,56 @@ bb:
 define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-NEXT:    scratch_store_dword v0, v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: private_ptr_foo:
 ; GFX9-PAL:       ; %bb.0:
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: private_ptr_foo:
 ; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: private_ptr_foo:
 ; GFX11-PAL:       ; %bb.0:
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -1040,7 +1027,6 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 define void @zero_init_small_offset_foo() {
 ; GFX9-LABEL: zero_init_small_offset_foo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
@@ -1055,12 +1041,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zero_init_small_offset_foo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
@@ -1075,11 +1060,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zero_init_small_offset_foo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
@@ -1094,11 +1079,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: zero_init_small_offset_foo:
 ; GFX9-PAL:       ; %bb.0:
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
@@ -1113,12 +1098,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: zero_init_small_offset_foo:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_mov_b32 s0, 0
@@ -1131,12 +1115,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: zero_init_small_offset_foo:
 ; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
@@ -1151,11 +1134,11 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-PAL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: zero_init_small_offset_foo:
 ; GFX11-PAL:       ; %bb.0:
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
@@ -1170,6 +1153,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-PAL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -1712,7 +1696,6 @@ bb:
 define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
@@ -1724,12 +1707,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_small_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
@@ -1741,12 +1723,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
@@ -1756,12 +1737,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x100
@@ -1773,12 +1753,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_small_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x100
@@ -1790,12 +1769,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
@@ -1807,12 +1785,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x100
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
@@ -1822,7 +1799,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -2029,7 +2006,6 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 define void @zero_init_large_offset_foo() {
 ; GFX9-LABEL: zero_init_large_offset_foo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, 0
@@ -2048,12 +2024,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zero_init_large_offset_foo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
@@ -2072,11 +2047,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zero_init_large_offset_foo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
@@ -2094,11 +2069,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0:
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
@@ -2117,12 +2092,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: zero_init_large_offset_foo:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_mov_b32 s0, 0
@@ -2139,12 +2113,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX1010-PAL:       ; %bb.0:
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
@@ -2166,11 +2139,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX1010-PAL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX1030-PAL:       ; %bb.0:
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
@@ -2189,11 +2162,11 @@ define void @zero_init_large_offset_foo() {
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
+; GFX1030-PAL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: zero_init_large_offset_foo:
 ; GFX11-PAL:       ; %bb.0:
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
@@ -2211,6 +2184,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
+; GFX11-PAL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
@@ -2756,7 +2730,6 @@ bb:
 define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2768,12 +2741,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_large_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
@@ -2785,12 +2757,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
@@ -2801,12 +2772,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2818,12 +2788,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_large_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
@@ -2836,12 +2805,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
@@ -2853,12 +2821,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
@@ -2869,7 +2836,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -3044,7 +3011,6 @@ bb:
 define void @store_load_large_imm_offset_foo() {
 ; GFX9-LABEL: store_load_large_imm_offset_foo:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-NEXT:    s_add_i32 s1, s32, 4
@@ -3055,12 +3021,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_large_imm_offset_foo:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_movk_i32 s0, 0x3800
@@ -3071,12 +3036,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
 ; GFX11-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX11-NEXT:    s_add_i32 s1, s32, 4
@@ -3087,12 +3051,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX9-PAL-NEXT:    s_add_i32 s1, s32, 4
@@ -3103,12 +3066,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_large_imm_offset_foo:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX940-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX940-NEXT:    s_add_i32 s1, s32, 4
@@ -3119,12 +3081,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940-NEXT:    scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
@@ -3135,12 +3096,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 4
@@ -3151,7 +3111,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -3302,77 +3262,70 @@ bb:
 define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_aligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i64_aligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i64_aligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i64_aligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i64_aligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_i64_aligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i64_aligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 8
@@ -3383,77 +3336,70 @@ bb:
 define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i64_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i64_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i64_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i64_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i64_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_i64_unaligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i64_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, ptr addrspace(5) %arg, align 1
@@ -3464,84 +3410,77 @@ bb:
 define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v3i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_v3i32_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_v3i32_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_v3i32_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-NEXT:    v_mov_b32_e32 v4, 3
 ; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
@@ -3552,7 +3491,6 @@ bb:
 define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_v4i32_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 3
@@ -3560,12 +3498,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_v4i32_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 3
@@ -3573,23 +3510,21 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_v4i32_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
 ; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
@@ -3597,12 +3532,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_v4i32_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX940-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-NEXT:    v_mov_b32_e32 v4, 3
@@ -3610,12 +3544,11 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
@@ -3623,18 +3556,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
 ; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
@@ -3645,77 +3577,70 @@ bb:
 define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i32_negative_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    scratch_store_byte v0, v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i32_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
 ; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i32_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
 ; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
@@ -3727,77 +3652,70 @@ bb:
 define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-NEXT:    scratch_store_byte v0, v1, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
 ; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX9-PAL:       ; %bb.0: ; %bb
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
-; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
 ; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
-; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index c4a88ebf4897294..0093114e47c92be 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -10,26 +10,29 @@
 define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
   ret void
@@ -38,32 +41,33 @@ define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
@@ -73,26 +77,29 @@ define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_xchg_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -101,32 +108,33 @@ define i32 @flat_atomic_xchg_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
@@ -136,7 +144,6 @@ define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -144,11 +151,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -156,11 +163,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -168,6 +175,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
   ret void
@@ -176,7 +184,6 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_scalar(ptr inreg %ptr, i32 in
 define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -186,11 +193,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -200,11 +207,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -212,6 +219,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
@@ -221,7 +229,6 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -229,11 +236,11 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -241,11 +248,11 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -253,6 +260,7 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -261,7 +269,6 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -271,11 +278,11 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -285,11 +292,11 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -297,6 +304,7 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
@@ -310,26 +318,29 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
 define void @flat_atomic_xchg_f32_noret(ptr %ptr, float %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
   ret void
@@ -338,32 +349,33 @@ define void @flat_atomic_xchg_f32_noret(ptr %ptr, float %in) {
 define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
@@ -373,26 +385,29 @@ define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) {
 define float @flat_atomic_xchg_f32_ret(ptr %ptr, float %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, float %in seq_cst
   ret float %result
@@ -401,32 +416,33 @@ define float @flat_atomic_xchg_f32_ret(ptr %ptr, float %in) {
 define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, float %in seq_cst
@@ -436,7 +452,6 @@ define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) {
 define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -444,11 +459,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -456,11 +471,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -468,6 +483,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, float %in seq_cst
   ret void
@@ -476,7 +492,6 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_scalar(ptr inreg %ptr, float
 define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out, float inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -486,11 +501,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -500,11 +515,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -512,6 +527,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
@@ -521,7 +537,6 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -529,11 +544,11 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float i
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -541,11 +556,11 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float i
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -553,6 +568,7 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float i
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, float %in seq_cst
   ret float %result
@@ -561,7 +577,6 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_scalar(ptr inreg %ptr, float i
 define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, float inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -571,11 +586,11 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -585,11 +600,11 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -597,6 +612,7 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, float %in seq_cst
@@ -610,26 +626,29 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
 define void @flat_atomic_add_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
   ret void
@@ -638,32 +657,33 @@ define void @flat_atomic_add_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
@@ -673,26 +693,29 @@ define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -701,32 +724,33 @@ define i32 @flat_atomic_add_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw add ptr %gep, i32 %in seq_cst
@@ -736,7 +760,6 @@ define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -744,11 +767,11 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -756,11 +779,11 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -768,6 +791,7 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr %ptr, i32 %in seq_cst
   ret void
@@ -776,7 +800,6 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -786,11 +809,11 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -800,11 +823,11 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_add v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -812,6 +835,7 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_add v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
@@ -821,7 +845,6 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -829,11 +852,11 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -841,11 +864,11 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -853,6 +876,7 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -861,7 +885,6 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -871,11 +894,11 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -885,11 +908,11 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -897,6 +920,7 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    flat_atomic_add v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw add ptr %gep, i32 %in seq_cst
@@ -910,26 +934,29 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
 define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret void
@@ -938,32 +965,33 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -973,26 +1001,29 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1001,32 +1032,33 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1036,7 +1068,6 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1044,11 +1075,11 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -1056,11 +1087,11 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1068,6 +1099,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret void
@@ -1076,7 +1108,6 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -1086,11 +1117,11 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -1100,11 +1131,11 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_sub v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1112,6 +1143,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_sub v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1121,7 +1153,6 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1129,11 +1160,11 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -1141,11 +1172,11 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1153,6 +1184,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1161,7 +1193,6 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -1171,11 +1202,11 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -1185,11 +1216,11 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1197,6 +1228,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    flat_atomic_sub v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw sub ptr %gep, i32 %in seq_cst
@@ -1210,26 +1242,29 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
 define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret void
@@ -1238,32 +1273,33 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1273,26 +1309,29 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1301,32 +1340,33 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1336,7 +1376,6 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1344,11 +1383,11 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -1356,11 +1395,11 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1368,6 +1407,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret void
@@ -1376,7 +1416,6 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -1386,11 +1425,11 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -1400,11 +1439,11 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_and v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1412,6 +1451,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_and v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1421,7 +1461,6 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -1429,11 +1468,11 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -1441,11 +1480,11 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1453,6 +1492,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1461,7 +1501,6 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -1471,11 +1510,11 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -1485,11 +1524,11 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -1497,6 +1536,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    flat_atomic_and v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw and ptr %gep, i32 %in seq_cst
@@ -1510,7 +1550,6 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
 define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -1525,6 +1564,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1533,7 +1573,6 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -1548,6 +1587,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1556,7 +1596,6 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -1571,6 +1610,7 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1583,7 +1623,6 @@ define void @flat_atomic_nand_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -1600,6 +1639,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1608,7 +1648,6 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -1625,6 +1664,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1633,7 +1673,6 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB41_1: ; %atomicrmw.start
@@ -1648,6 +1687,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1661,7 +1701,6 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -1676,6 +1715,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1685,7 +1725,6 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -1700,6 +1739,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1709,7 +1749,6 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -1724,6 +1763,7 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1737,7 +1777,6 @@ define i32 @flat_atomic_nand_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v0, v[3:4]
@@ -1754,6 +1793,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1762,7 +1802,6 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v0, v[3:4]
@@ -1779,6 +1818,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1787,7 +1827,6 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB43_1: ; %atomicrmw.start
@@ -1802,6 +1841,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1816,7 +1856,6 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
@@ -1833,6 +1872,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1841,7 +1881,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -1858,6 +1897,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1866,7 +1906,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -1883,6 +1922,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1895,7 +1935,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
 define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -1914,6 +1953,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1922,7 +1962,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -1941,6 +1980,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1949,7 +1989,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -1966,6 +2005,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1979,7 +2019,6 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v0, v[0:1]
@@ -1998,6 +2037,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2006,7 +2046,6 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v0, v[0:1]
@@ -2025,6 +2064,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2033,7 +2073,6 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1]
@@ -2052,6 +2091,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2064,7 +2104,6 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
@@ -2083,6 +2122,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2091,7 +2131,6 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN2-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
@@ -2110,6 +2149,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2118,7 +2158,6 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN3-LABEL: flat_atomic_nand_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
@@ -2137,6 +2176,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2154,26 +2194,29 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
 define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret void
@@ -2182,32 +2225,33 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2217,26 +2261,29 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2245,32 +2292,33 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2280,7 +2328,6 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -2288,11 +2335,11 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN1-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -2300,11 +2347,11 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN2-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2312,6 +2359,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 ; GCN3-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret void
@@ -2320,7 +2368,6 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre
 define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -2330,11 +2377,11 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -2344,11 +2391,11 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT:    flat_atomic_or v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2356,6 +2403,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT:    flat_atomic_or v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2365,7 +2413,6 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
 define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -2373,11 +2420,11 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
 ; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -2385,11 +2432,11 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
 ; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2397,6 +2444,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
 ; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2405,7 +2453,6 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg %
 define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -2415,11 +2462,11 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -2429,11 +2476,11 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2441,6 +2488,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    flat_atomic_or v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw or ptr %gep, i32 %in seq_cst
@@ -2454,26 +2502,29 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
 define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
   ret void
@@ -2482,32 +2533,33 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -2517,26 +2569,29 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2545,32 +2600,33 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -2580,7 +2636,6 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -2588,11 +2643,11 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -2600,11 +2655,11 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2612,6 +2667,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst
   ret void
@@ -2620,7 +2676,6 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -2630,11 +2685,11 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -2644,11 +2699,11 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_xor v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2656,6 +2711,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_xor v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -2665,7 +2721,6 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -2673,11 +2728,11 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -2685,11 +2740,11 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2697,6 +2752,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2705,7 +2761,6 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -2715,11 +2770,11 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -2729,11 +2784,11 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -2741,6 +2796,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    flat_atomic_xor v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw xor ptr %gep, i32 %in seq_cst
@@ -2754,7 +2810,6 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
 define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -2768,6 +2823,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2776,7 +2832,6 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -2790,6 +2845,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2798,7 +2854,6 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -2812,6 +2867,7 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2824,7 +2880,6 @@ define void @flat_atomic_max_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -2840,6 +2895,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2848,7 +2904,6 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -2864,6 +2919,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2872,7 +2928,6 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB65_1: ; %atomicrmw.start
@@ -2886,6 +2941,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2899,7 +2955,6 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -2913,6 +2968,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2922,7 +2978,6 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -2936,6 +2991,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2945,7 +3001,6 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -2959,6 +3014,7 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2972,7 +3028,6 @@ define i32 @flat_atomic_max_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v0, v[3:4]
@@ -2988,6 +3043,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2996,7 +3052,6 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v0, v[3:4]
@@ -3012,6 +3067,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3020,7 +3076,6 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB67_1: ; %atomicrmw.start
@@ -3034,6 +3089,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3048,7 +3104,6 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
@@ -3064,6 +3119,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3072,7 +3128,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -3088,6 +3143,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3096,7 +3152,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -3112,6 +3167,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3124,7 +3180,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -3142,6 +3197,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3150,7 +3206,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -3168,6 +3223,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3176,7 +3232,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -3192,6 +3247,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3205,7 +3261,6 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v0, v[0:1]
@@ -3223,6 +3278,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3231,7 +3287,6 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v0, v[0:1]
@@ -3249,6 +3304,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3257,7 +3313,6 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1]
@@ -3275,6 +3330,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3287,7 +3343,6 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
@@ -3305,6 +3360,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3313,7 +3369,6 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
 ;
 ; GCN2-LABEL: flat_atomic_max_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
@@ -3331,6 +3386,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3339,7 +3395,6 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
 ;
 ; GCN3-LABEL: flat_atomic_max_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
@@ -3357,6 +3412,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3788,7 +3844,6 @@ entry:
 define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -3802,6 +3857,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3810,7 +3866,6 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -3824,6 +3879,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3832,7 +3888,6 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -3846,6 +3901,7 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3858,7 +3914,6 @@ define void @flat_atomic_umax_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -3874,6 +3929,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3882,7 +3938,6 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -3898,6 +3953,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3906,7 +3962,6 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB77_1: ; %atomicrmw.start
@@ -3920,6 +3975,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3933,7 +3989,6 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -3947,6 +4002,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3956,7 +4012,6 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -3970,6 +4025,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3979,7 +4035,6 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -3993,6 +4048,7 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4006,7 +4062,6 @@ define i32 @flat_atomic_umax_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v0, v[3:4]
@@ -4022,6 +4077,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4030,7 +4086,6 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v0, v[3:4]
@@ -4046,6 +4101,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4054,7 +4110,6 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB79_1: ; %atomicrmw.start
@@ -4068,6 +4123,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4082,7 +4138,6 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
@@ -4098,6 +4153,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4106,7 +4162,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -4122,6 +4177,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4130,7 +4186,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -4146,6 +4201,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4158,7 +4214,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_scalar(ptr inreg %ptr, i32 in
 define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -4176,6 +4231,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4184,7 +4240,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -4202,6 +4257,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4210,7 +4266,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -4226,6 +4281,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4239,7 +4295,6 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v0, v[0:1]
@@ -4257,6 +4312,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4265,7 +4321,6 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v0, v[0:1]
@@ -4283,6 +4338,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4291,7 +4347,6 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1]
@@ -4309,6 +4364,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4321,7 +4377,6 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
@@ -4339,6 +4394,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4347,7 +4403,6 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN2-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
@@ -4365,6 +4420,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4373,7 +4429,6 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN3-LABEL: flat_atomic_umax_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
@@ -4391,6 +4446,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4729,7 +4785,6 @@ entry:
 define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -4743,6 +4798,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4751,7 +4807,6 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -4765,6 +4820,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4773,7 +4829,6 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -4787,6 +4842,7 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4799,7 +4855,6 @@ define void @flat_atomic_umin_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -4815,6 +4870,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4823,7 +4879,6 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -4839,6 +4894,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4847,7 +4903,6 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
@@ -4861,6 +4916,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4874,7 +4930,6 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -4888,6 +4943,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4897,7 +4953,6 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -4911,6 +4966,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4920,7 +4976,6 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -4934,6 +4989,7 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4947,7 +5003,6 @@ define i32 @flat_atomic_umin_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v0, v[3:4]
@@ -4963,6 +5018,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4971,7 +5027,6 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v0, v[3:4]
@@ -4987,6 +5042,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4995,7 +5051,6 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
@@ -5009,6 +5064,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5023,7 +5079,6 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
@@ -5039,6 +5094,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5047,7 +5103,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -5063,6 +5118,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5071,7 +5127,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -5087,6 +5142,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5099,7 +5155,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_scalar(ptr inreg %ptr, i32 in
 define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -5117,6 +5172,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5125,7 +5181,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -5143,6 +5198,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5151,7 +5207,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5167,6 +5222,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5180,7 +5236,6 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v0, v[0:1]
@@ -5198,6 +5253,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5206,7 +5262,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v0, v[0:1]
@@ -5224,6 +5279,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5232,7 +5288,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1]
@@ -5250,6 +5305,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5262,7 +5318,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
@@ -5280,6 +5335,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5288,7 +5344,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN2-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
@@ -5306,6 +5361,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5314,7 +5370,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 ;
 ; GCN3-LABEL: flat_atomic_umin_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
@@ -5332,6 +5387,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5349,7 +5405,6 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
 define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -5363,6 +5418,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5371,7 +5427,6 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -5385,6 +5440,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5393,7 +5449,6 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -5407,6 +5462,7 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5419,7 +5475,6 @@ define void @flat_atomic_min_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -5435,6 +5490,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v4, v3
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5443,7 +5499,6 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -5459,6 +5514,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v4, v3
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5467,7 +5523,6 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v4, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB96_1: ; %atomicrmw.start
@@ -5481,6 +5536,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v4, v3
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5494,7 +5550,6 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN1-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN1-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -5508,6 +5563,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5517,7 +5573,6 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN2-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN2-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -5531,6 +5586,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5540,7 +5596,6 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -5554,6 +5609,7 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5567,7 +5623,6 @@ define i32 @flat_atomic_min_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v0, v[3:4]
@@ -5583,6 +5638,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5591,7 +5647,6 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v0, v[3:4]
@@ -5607,6 +5662,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5615,7 +5671,6 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB98_1: ; %atomicrmw.start
@@ -5629,6 +5684,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5643,7 +5699,6 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v3, v[0:1]
@@ -5659,6 +5714,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5667,7 +5723,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v3, v[0:1]
@@ -5683,6 +5738,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5691,7 +5747,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1]
@@ -5707,6 +5762,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5719,7 +5775,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_scalar(ptr inreg %ptr, i32 inr
 define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -5737,6 +5792,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v2
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5745,7 +5801,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -5763,6 +5818,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v2
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5771,7 +5827,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v3, v[0:1] offset:16
@@ -5787,6 +5842,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v2
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5800,7 +5856,6 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    flat_load_dword v0, v[0:1]
@@ -5818,6 +5873,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5826,7 +5882,6 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    flat_load_dword v0, v[0:1]
@@ -5844,6 +5899,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5852,7 +5908,6 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1]
@@ -5870,6 +5925,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5882,7 +5938,6 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_scalar(ptr inreg %ptr, i32 inreg
 define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s34
@@ -5900,6 +5955,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5908,7 +5964,6 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
 ;
 ; GCN2-LABEL: flat_atomic_min_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s34
@@ -5926,6 +5981,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5934,7 +5990,6 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
 ;
 ; GCN3-LABEL: flat_atomic_min_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dword v0, v[0:1] offset:16
@@ -5952,6 +6007,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6370,26 +6426,29 @@ entry:
 define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
   ret void
@@ -6398,32 +6457,33 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -6433,26 +6493,29 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -6461,32 +6524,33 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -6496,7 +6560,6 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -6504,11 +6567,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -6516,11 +6579,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6528,6 +6591,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
   ret void
@@ -6536,7 +6600,6 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i
 define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -6546,11 +6609,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -6560,11 +6623,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT:    flat_atomic_inc v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6572,6 +6635,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT:    flat_atomic_inc v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -6581,7 +6645,6 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
 define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -6589,11 +6652,11 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -6601,11 +6664,11 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6613,6 +6676,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -6621,7 +6685,6 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -6631,11 +6694,11 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -6645,11 +6708,11 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6657,6 +6720,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT:    flat_atomic_inc v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
@@ -6670,26 +6734,29 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
 define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
   ret void
@@ -6698,32 +6765,33 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) {
 define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -6733,26 +6801,29 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
 define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -6761,32 +6832,33 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) {
 define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 16, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -6796,7 +6868,6 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
 define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -6804,11 +6875,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -6816,11 +6887,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6828,6 +6899,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
   ret void
@@ -6836,7 +6908,6 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i
 define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -6846,11 +6917,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -6860,11 +6931,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT:    flat_atomic_dec v[0:1], v2
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6872,6 +6943,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT:    flat_atomic_dec v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
@@ -6881,7 +6953,6 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
 define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s6
@@ -6889,11 +6960,11 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s6
@@ -6901,11 +6972,11 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6913,6 +6984,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 ; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst
   ret i32 %result
@@ -6921,7 +6993,6 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32
 define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %out, i32 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 16
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s34
@@ -6931,11 +7002,11 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 16
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s34
@@ -6945,11 +7016,11 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s6
@@ -6957,6 +7028,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT:    flat_atomic_dec v0, v[0:1], v2 offset:16 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr %out, i32 4
   %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index bde6d3fe2658f46..b523ae60a29ad29 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -10,26 +10,29 @@
 define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
   ret void
@@ -38,32 +41,33 @@ define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
@@ -73,26 +77,29 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -101,32 +108,33 @@ define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
@@ -136,7 +144,6 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -145,11 +152,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -158,11 +165,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -171,6 +178,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst
   ret void
@@ -179,7 +187,6 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 in
 define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -190,11 +197,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -205,11 +212,11 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -218,6 +225,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst
@@ -227,7 +235,6 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -236,11 +243,11 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -249,11 +256,11 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -262,6 +269,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -270,7 +278,6 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -281,11 +288,11 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -296,11 +303,11 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -309,6 +316,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw xchg ptr %gep, i64 %in seq_cst
@@ -322,26 +330,29 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6
 define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
   ret void
@@ -350,32 +361,33 @@ define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
 define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
@@ -385,26 +397,29 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
 define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, double %in seq_cst
   ret double %result
@@ -413,32 +428,33 @@ define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
 define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, double %in seq_cst
@@ -448,7 +464,6 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -457,11 +472,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -470,11 +485,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -483,6 +498,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst
   ret void
@@ -491,7 +507,6 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double
 define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -502,11 +517,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -517,11 +532,11 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -530,6 +545,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
   %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst
@@ -539,7 +555,6 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -548,11 +563,11 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -561,11 +576,11 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -574,6 +589,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr %ptr, double %in seq_cst
   ret double %result
@@ -582,7 +598,6 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double
 define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) {
 ; GCN1-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -593,11 +608,11 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -608,11 +623,11 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -621,6 +636,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr %out, i32 4
   %result = atomicrmw xchg ptr %gep, double %in seq_cst
@@ -634,26 +650,29 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out,
 define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
   ret void
@@ -662,32 +681,33 @@ define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
@@ -697,26 +717,29 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -725,32 +748,33 @@ define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw add ptr %gep, i64 %in seq_cst
@@ -760,7 +784,6 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -769,11 +792,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -782,11 +805,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -795,6 +818,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst
   ret void
@@ -803,7 +827,6 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -814,11 +837,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -829,11 +852,11 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -842,6 +865,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst
@@ -851,7 +875,6 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -860,11 +883,11 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -873,11 +896,11 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -886,6 +909,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -894,7 +918,6 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_add_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -905,11 +928,11 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_add_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -920,11 +943,11 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_add_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -933,6 +956,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw add ptr %gep, i64 %in seq_cst
@@ -946,26 +970,29 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64
 define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
   ret void
@@ -974,32 +1001,33 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1009,26 +1037,29 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1037,32 +1068,33 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1072,7 +1104,6 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1081,11 +1112,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1094,11 +1125,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1107,6 +1138,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst
   ret void
@@ -1115,7 +1147,6 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -1126,11 +1157,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -1141,11 +1172,11 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1154,6 +1185,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1163,7 +1195,6 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1172,11 +1203,11 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1185,11 +1216,11 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1198,6 +1229,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1206,7 +1238,6 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -1217,11 +1248,11 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -1232,11 +1263,11 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1245,6 +1276,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw sub ptr %gep, i64 %in seq_cst
@@ -1258,26 +1290,29 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64
 define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
   ret void
@@ -1286,32 +1321,33 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1321,26 +1357,29 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1349,32 +1388,33 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1384,7 +1424,6 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1393,11 +1432,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1406,11 +1445,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1419,6 +1458,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst
   ret void
@@ -1427,7 +1467,6 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -1438,11 +1477,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -1453,11 +1492,11 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1466,6 +1505,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1475,7 +1515,6 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -1484,11 +1523,11 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -1497,11 +1536,11 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1510,6 +1549,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1518,7 +1558,6 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -1529,11 +1568,11 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -1544,11 +1583,11 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_and_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -1557,6 +1596,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw and ptr %gep, i64 %in seq_cst
@@ -1570,7 +1610,6 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64
 define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v6, v[0:1]
@@ -1591,6 +1630,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v4
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1599,7 +1639,6 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v6, v[0:1]
@@ -1620,6 +1659,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v4
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1628,7 +1668,6 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -1646,6 +1685,7 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB40_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1658,7 +1698,6 @@ define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -1681,6 +1720,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1689,7 +1729,6 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -1712,6 +1751,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1720,7 +1760,6 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB41_1: ; %atomicrmw.start
@@ -1738,6 +1777,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB41_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1751,7 +1791,6 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -1772,6 +1811,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1782,7 +1822,6 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -1803,6 +1842,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1813,7 +1853,6 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -1831,6 +1870,7 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB42_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1845,7 +1885,6 @@ define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -1868,6 +1907,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1876,7 +1916,6 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -1899,6 +1938,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1907,7 +1947,6 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB43_1: ; %atomicrmw.start
@@ -1925,6 +1964,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB43_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1940,7 +1980,6 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -1967,6 +2006,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1975,7 +2015,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -2002,6 +2041,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2010,7 +2050,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -2032,6 +2071,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB44_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2044,7 +2084,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 in
 define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -2071,6 +2110,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2079,7 +2119,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -2106,6 +2145,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2114,7 +2154,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
@@ -2136,6 +2175,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB45_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2149,7 +2189,6 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -2176,6 +2215,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2184,7 +2224,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -2211,6 +2250,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2219,7 +2259,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -2241,6 +2280,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB46_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2253,7 +2293,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -2280,6 +2319,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2288,7 +2328,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN2-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -2315,6 +2354,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2323,7 +2363,6 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN3-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
@@ -2345,6 +2384,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB47_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2362,26 +2402,29 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6
 define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
   ret void
@@ -2390,32 +2433,33 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2425,26 +2469,29 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2453,32 +2500,33 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2488,7 +2536,6 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2497,11 +2544,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN1-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2510,11 +2557,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN2-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2523,6 +2570,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 ; GCN3-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst
   ret void
@@ -2531,7 +2579,6 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre
 define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -2542,11 +2589,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN1-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -2557,11 +2604,11 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN2-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2570,6 +2617,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 ; GCN3-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2579,7 +2627,6 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i
 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2588,11 +2635,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2601,11 +2648,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2614,6 +2661,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2622,7 +2670,6 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %
 define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -2633,11 +2680,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -2648,11 +2695,11 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_or_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2661,6 +2708,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw or ptr %gep, i64 %in seq_cst
@@ -2674,26 +2722,29 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64
 define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
   ret void
@@ -2702,32 +2753,33 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
@@ -2737,26 +2789,29 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2765,32 +2820,33 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw xor ptr %gep, i64 %in seq_cst
@@ -2800,7 +2856,6 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2809,11 +2864,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2822,11 +2877,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2835,6 +2890,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst
   ret void
@@ -2843,7 +2899,6 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -2854,11 +2909,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -2869,11 +2924,11 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2882,6 +2937,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst
@@ -2891,7 +2947,6 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -2900,11 +2955,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -2913,11 +2968,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2926,6 +2981,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2934,7 +2990,6 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -2945,11 +3000,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -2960,11 +3015,11 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -2973,6 +3028,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw xor ptr %gep, i64 %in seq_cst
@@ -2986,7 +3042,6 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64
 define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v6, v[0:1]
@@ -3006,6 +3061,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v4
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3014,7 +3070,6 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v6, v[0:1]
@@ -3034,6 +3089,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v4
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3042,7 +3098,6 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -3059,6 +3114,7 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB64_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3071,7 +3127,6 @@ define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -3093,6 +3148,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3101,7 +3157,6 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -3123,6 +3178,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3131,7 +3187,6 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB65_1: ; %atomicrmw.start
@@ -3148,6 +3203,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB65_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3161,7 +3217,6 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -3181,6 +3236,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3191,7 +3247,6 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -3211,6 +3266,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3221,7 +3277,6 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -3238,6 +3293,7 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB66_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3252,7 +3308,6 @@ define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -3274,6 +3329,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3282,7 +3338,6 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -3304,6 +3359,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3312,7 +3368,6 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB67_1: ; %atomicrmw.start
@@ -3329,6 +3384,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB67_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3344,7 +3400,6 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -3372,6 +3427,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3380,7 +3436,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -3408,6 +3463,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3416,7 +3472,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -3439,6 +3494,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB68_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3451,7 +3507,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -3479,6 +3534,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3487,7 +3543,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -3515,6 +3570,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3523,7 +3579,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
@@ -3546,6 +3601,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB69_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3559,7 +3615,6 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -3587,6 +3642,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3595,7 +3651,6 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -3623,6 +3678,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3631,7 +3687,6 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -3654,6 +3709,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB70_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3666,7 +3722,6 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_max_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -3694,6 +3749,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3702,7 +3758,6 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ;
 ; GCN2-LABEL: flat_atomic_max_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -3730,6 +3785,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3738,7 +3794,6 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ;
 ; GCN3-LABEL: flat_atomic_max_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
@@ -3761,6 +3816,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB71_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4228,7 +4284,6 @@ entry:
 define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v6, v[0:1]
@@ -4248,6 +4303,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v4
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4256,7 +4312,6 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v6, v[0:1]
@@ -4276,6 +4331,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v4
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4284,7 +4340,6 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -4301,6 +4356,7 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB76_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4313,7 +4369,6 @@ define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -4335,6 +4390,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4343,7 +4399,6 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -4365,6 +4420,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4373,7 +4429,6 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB77_1: ; %atomicrmw.start
@@ -4390,6 +4445,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB77_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4403,7 +4459,6 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -4423,6 +4478,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4433,7 +4489,6 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -4453,6 +4508,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4463,7 +4519,6 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -4480,6 +4535,7 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB78_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4494,7 +4550,6 @@ define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -4516,6 +4571,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4524,7 +4580,6 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -4546,6 +4601,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4554,7 +4610,6 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB79_1: ; %atomicrmw.start
@@ -4571,6 +4626,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB79_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4586,7 +4642,6 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -4614,6 +4669,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4622,7 +4678,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -4650,6 +4705,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4658,7 +4714,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -4681,6 +4736,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB80_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4693,7 +4749,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 in
 define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -4721,6 +4776,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4729,7 +4785,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -4757,6 +4812,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4765,7 +4821,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
@@ -4788,6 +4843,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB81_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4801,7 +4857,6 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -4829,6 +4884,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4837,7 +4893,6 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -4865,6 +4920,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4873,7 +4929,6 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -4896,6 +4951,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB82_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4908,7 +4964,6 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -4936,6 +4991,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4944,7 +5000,6 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN2-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -4972,6 +5027,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4980,7 +5036,6 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN3-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
@@ -5003,6 +5058,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB83_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5365,7 +5421,6 @@ entry:
 define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v6, v[0:1]
@@ -5385,6 +5440,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v4
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5393,7 +5449,6 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v6, v[0:1]
@@ -5413,6 +5468,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v4
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5421,7 +5477,6 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -5438,6 +5493,7 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB87_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5450,7 +5506,6 @@ define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -5472,6 +5527,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5480,7 +5536,6 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -5502,6 +5557,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5510,7 +5566,6 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB88_1: ; %atomicrmw.start
@@ -5527,6 +5582,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB88_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5540,7 +5596,6 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -5560,6 +5615,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5570,7 +5626,6 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -5590,6 +5645,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5600,7 +5656,6 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -5617,6 +5672,7 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB89_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5631,7 +5687,6 @@ define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -5653,6 +5708,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5661,7 +5717,6 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -5683,6 +5738,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5691,7 +5747,6 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB90_1: ; %atomicrmw.start
@@ -5708,6 +5763,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB90_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5723,7 +5779,6 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -5751,6 +5806,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5759,7 +5815,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -5787,6 +5842,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5795,7 +5851,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -5818,6 +5873,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB91_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5830,7 +5886,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 in
 define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -5858,6 +5913,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5866,7 +5922,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -5894,6 +5949,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5902,7 +5958,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
@@ -5925,6 +5980,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB92_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5938,7 +5994,6 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -5966,6 +6021,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5974,7 +6030,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -6002,6 +6057,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6010,7 +6066,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -6033,6 +6088,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB93_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6045,7 +6101,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -6073,6 +6128,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6081,7 +6137,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN2-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -6109,6 +6164,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6117,7 +6173,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ;
 ; GCN3-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
@@ -6140,6 +6195,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB94_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6157,7 +6213,6 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6
 define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v6, v[0:1]
@@ -6177,6 +6232,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v4
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6185,7 +6241,6 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v6, v[0:1]
@@ -6205,6 +6260,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v4
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6213,7 +6269,6 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -6230,6 +6285,7 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB95_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6242,7 +6298,6 @@ define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -6264,6 +6319,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN1-NEXT:    v_mov_b32_e32 v6, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6272,7 +6328,6 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -6294,6 +6349,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    v_mov_b32_e32 v7, v1
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN2-NEXT:    v_mov_b32_e32 v6, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6302,7 +6358,6 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB96_1: ; %atomicrmw.start
@@ -6319,6 +6374,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN3-NEXT:    v_mov_b32_e32 v6, v4
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB96_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6332,7 +6388,6 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN1-NEXT:    flat_load_dword v4, v[0:1]
@@ -6352,6 +6407,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6362,7 +6418,6 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
 ; GCN2-NEXT:    flat_load_dword v4, v[0:1]
@@ -6382,6 +6437,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6392,7 +6448,6 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -6409,6 +6464,7 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB97_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6423,7 +6479,6 @@ define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
@@ -6445,6 +6500,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6453,7 +6509,6 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
@@ -6475,6 +6530,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6483,7 +6539,6 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
 ; GCN3-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN3-NEXT:  .LBB98_1: ; %atomicrmw.start
@@ -6500,6 +6555,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GCN3-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB98_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6515,7 +6571,6 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -6543,6 +6598,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6551,7 +6607,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -6579,6 +6634,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6587,7 +6643,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -6610,6 +6665,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB99_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6622,7 +6678,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inr
 define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -6650,6 +6705,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN1-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN1-NEXT:    v_mov_b32_e32 v2, v0
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6658,7 +6714,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -6686,6 +6741,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN2-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN2-NEXT:    v_mov_b32_e32 v2, v0
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6694,7 +6750,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[2:3], v[0:1] offset:32
@@ -6717,6 +6772,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 ; GCN3-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GCN3-NEXT:    v_mov_b32_e32 v2, v0
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB100_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6730,7 +6786,6 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out,
 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN1-NEXT:    s_add_u32 s34, s4, 4
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s5
@@ -6758,6 +6813,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6766,7 +6822,6 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN2-NEXT:    s_add_u32 s34, s4, 4
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s5
@@ -6794,6 +6849,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6802,7 +6858,6 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -6825,6 +6880,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB101_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6837,7 +6893,6 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg
 define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_min_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    s_add_u32 s36, s4, 36
@@ -6865,6 +6920,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN1-NEXT:    buffer_wbinvl1_vol
 ; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN1-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN1-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN1-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6873,7 +6929,6 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ;
 ; GCN2-LABEL: flat_atomic_min_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    s_add_u32 s36, s4, 36
@@ -6901,6 +6956,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN2-NEXT:    buffer_wbinvl1_vol
 ; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN2-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN2-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN2-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6909,7 +6965,6 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ;
 ; GCN3-LABEL: flat_atomic_min_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN3-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
@@ -6932,6 +6987,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; GCN3-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GCN3-NEXT:    s_cbranch_execnz .LBB102_1
 ; GCN3-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7392,26 +7448,29 @@ entry:
 define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
   ret void
@@ -7420,32 +7479,33 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
@@ -7455,26 +7515,29 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -7483,32 +7546,33 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
@@ -7518,7 +7582,6 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -7527,11 +7590,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -7540,11 +7603,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7553,6 +7616,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
   ret void
@@ -7561,7 +7625,6 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i
 define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -7572,11 +7635,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -7587,11 +7650,11 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7600,6 +7663,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
@@ -7609,7 +7673,6 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg
 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -7618,11 +7681,11 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -7631,11 +7694,11 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7644,6 +7707,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -7652,7 +7716,6 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -7663,11 +7726,11 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -7678,11 +7741,11 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7691,6 +7754,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst
@@ -7704,26 +7768,29 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou
 define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
   ret void
@@ -7732,32 +7799,33 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
 define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
@@ -7767,26 +7835,29 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
 define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -7795,32 +7866,33 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
 define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
 ; GCN1-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GCN2-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
@@ -7830,7 +7902,6 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -7839,11 +7910,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -7852,11 +7923,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7865,6 +7936,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
   ret void
@@ -7873,7 +7945,6 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i
 define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -7884,11 +7955,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -7899,11 +7970,11 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7912,6 +7983,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
@@ -7921,7 +7993,6 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg
 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN1-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s4
@@ -7930,11 +8001,11 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN2-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s4
@@ -7943,11 +8014,11 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -7956,6 +8027,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst
   ret i64 %result
@@ -7964,7 +8036,6 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64
 define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
 ; GCN1-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
 ; GCN1:       ; %bb.0:
-; GCN1-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    s_add_u32 s34, s4, 32
 ; GCN1-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN1-NEXT:    v_mov_b32_e32 v2, s34
@@ -7975,11 +8046,11 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN1-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN1-NEXT:    buffer_wbinvl1_vol
+; GCN1-NEXT:    s_waitcnt expcnt(0)
 ; GCN1-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN2-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
 ; GCN2:       ; %bb.0:
-; GCN2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    s_add_u32 s34, s4, 32
 ; GCN2-NEXT:    s_addc_u32 s35, s5, 0
 ; GCN2-NEXT:    v_mov_b32_e32 v2, s34
@@ -7990,11 +8061,11 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN2-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; GCN2-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN2-NEXT:    buffer_wbinvl1_vol
+; GCN2-NEXT:    s_waitcnt expcnt(0)
 ; GCN2-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
 ; GCN3:       ; %bb.0:
-; GCN3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN3-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN3-NEXT:    v_mov_b32_e32 v2, s4
@@ -8003,6 +8074,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou
 ; GCN3-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
 ; GCN3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
+; GCN3-NEXT:    s_waitcnt expcnt(0)
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr %out, i64 4
   %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 4ed3abff0ad8515..87d7485e2da11fd 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -12,20 +12,20 @@ declare half @llvm.maxnum.f16(half, half)
 define half @test_fma(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_fma:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %r
@@ -35,20 +35,20 @@ define half @test_fma(half %x, half %y, half %z) {
 define half @test_fmac(half %x, half %y, half %z) {
 ; GFX9-LABEL: test_fmac:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_fma_f16 v0, v1, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_fmac:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_fmac:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %y, half %z, half %x)
   ret half %r
@@ -58,28 +58,28 @@ define half @test_fmac(half %x, half %y, half %z) {
 define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX9-SDAG-LABEL: test_fmaak:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x4200
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_fmaak:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4200
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_fmaak:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_fmaak:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
   ret half %r
@@ -89,28 +89,28 @@ define half @test_fmaak(half %x, half %y, half %z) {
 define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX9-SDAG-LABEL: test_fmamk:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x4200
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v0, s4, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_fmamk:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4200
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_fmamk:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_fmamk:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
   ret half %r
@@ -120,7 +120,6 @@ define half @test_fmamk(half %x, half %y, half %z) {
 define i32 @test_D139469_f16(half %arg) {
 ; GFX9-SDAG-LABEL: test_D139469_f16:
 ; GFX9-SDAG:       ; %bb.0: ; %bb
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x291e
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x211e
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
@@ -128,11 +127,11 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_D139469_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %bb
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x291e
 ; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v2
@@ -141,22 +140,22 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e64 s[4:5], 0, v0
 ; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_D139469_f16:
 ; GFX10-SDAG:       ; %bb.0: ; %bb
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX10-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
 ; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v2, v1
 ; GFX10-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: test_D139469_f16:
 ; GFX10-GISEL:       ; %bb.0: ; %bb
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX10-GISEL-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
@@ -164,11 +163,11 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e64 s4, 0, v1
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_D139469_f16:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -177,11 +176,11 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_D139469_f16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -192,6 +191,7 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract half %arg, 0xH291E
@@ -206,7 +206,6 @@ bb:
 define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX9-SDAG-LABEL: test_D139469_v2f16:
 ; GFX9-SDAG:       ; %bb.0: ; %bb
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x291e
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x211e
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0]
@@ -217,11 +216,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX9-SDAG-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_D139469_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %bb
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x291e291e
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
@@ -235,11 +234,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: test_D139469_v2f16:
 ; GFX10-SDAG:       ; %bb.0: ; %bb
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x211e
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s4 op_sel_hi:[0,1,0]
@@ -249,11 +248,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX10-SDAG-NEXT:    v_cmp_lt_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: test_D139469_v2f16:
 ; GFX10-GISEL:       ; %bb.0: ; %bb
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1]
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1]
@@ -266,11 +265,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, s5, s6
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_D139469_v2f16:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x211e
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
@@ -282,11 +281,11 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_D139469_v2f16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -305,6 +304,7 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX11-GISEL-NEXT:    s_or_b32 s0, s1, s2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
index c6dc5f8d26704a9..0c3a277c1335521 100644
--- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll
@@ -20,39 +20,39 @@
 define float @unsafe_fmul_fadd_distribute_fast_f32(float %arg0, float %arg1) #0 {
 ; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, v1, v0, v0
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fmac_f32_e32 v0, v1, v0
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_fmac_f32_e32 v0, v1, v0
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mac_f32_e32 v0, v1, v0
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_f32:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_fmac_f32_e32 v0, v1, v0
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd fast float %arg1, 1.0
   %tmp1 = fmul fast float %arg0, %add
@@ -62,39 +62,39 @@ define float @unsafe_fmul_fadd_distribute_fast_f32(float %arg0, float %arg1) #0
 define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0 {
 ; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_sub_f32_e32 v1, 1.0, v1
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mad_f32 v0, -v1, v0, v0
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_fma_f32 v0, -v1, v0, v0
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = fsub fast float 1.0, %arg1
   %tmp1 = fmul fast float %arg0, %add
@@ -104,45 +104,45 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0
 define <2 x float> @unsafe_fmul_fadd_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
 ; FMA-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, v2, v0, v0
 ; FMA-NEXT:    v_fma_f32 v1, v3, v1, v1
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_add_f32_e32 v3, 1.0, v3
 ; NOFUSE-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fmac_f32_e32 v0, v2, v0
 ; FMAGFX10-NEXT:    v_fmac_f32_e32 v1, v3, v1
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_dual_fmac_f32 v0, v2, v0 :: v_dual_fmac_f32 v1, v3, v1
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mac_f32_e32 v0, v2, v0
 ; FMAD-NEXT:    v_mac_f32_e32 v1, v3, v1
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_fmac_f32_e32 v0, v2, v0
 ; FMADGFX10-NEXT:    v_fmac_f32_e32 v1, v3, v1
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd fast <2 x float> %arg1, <float 1.0, float 1.0>
   %tmp1 = fmul fast <2 x float> %arg0, %add
@@ -152,46 +152,46 @@ define <2 x float> @unsafe_fmul_fadd_distribute_fast_v2f32(<2 x float> %arg0, <2
 define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2 x float> %arg1) #0 {
 ; FMA-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, -v2, v0, v0
 ; FMA-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_sub_f32_e32 v3, 1.0, v3
 ; NOFUSE-NEXT:    v_sub_f32_e32 v2, 1.0, v2
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; NOFUSE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fma_f32 v0, -v2, v0, v0
 ; FMAGFX10-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_fma_f32 v0, -v2, v0, v0
 ; FMAGFX11-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mad_f32 v0, -v2, v0, v0
 ; FMAD-NEXT:    v_mad_f32 v1, -v3, v1, v1
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_fma_f32 v0, -v2, v0, v0
 ; FMADGFX10-NEXT:    v_fma_f32 v1, -v3, v1, v1
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = fsub  fast <2 x float> <float 1.0, float 1.0>, %arg1
   %tmp1 = fmul fast <2 x float> %arg0, %add
@@ -201,39 +201,39 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2
 define <2 x float> @unsafe_fast_fmul_fadd_distribute_post_legalize_f32(float %arg0, <2 x float> %arg1) #0 {
 ; FMA-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_fma_f32 v0, v0, v1, v1
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mad_f32 v0, v0, v1, v1
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_mad_f32 v0, v0, v1, v1
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd fast float %arg0, 1.0
   %splat = insertelement <2 x float> undef, float %add, i32 0
@@ -244,39 +244,39 @@ define <2 x float> @unsafe_fast_fmul_fadd_distribute_post_legalize_f32(float %ar
 define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <2 x float> %arg1) #0 {
 ; FMA-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; FMA:       ; %bb.0:
-; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOFUSE-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; NOFUSE:       ; %bb.0:
-; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    v_sub_f32_e32 v0, 1.0, v0
 ; NOFUSE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; NOFUSE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOFUSE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX10-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; FMAGFX10:       ; %bb.0:
-; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMAGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAGFX11-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; FMAGFX11:       ; %bb.0:
-; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    v_fma_f32 v0, -v0, v1, v1
+; FMAGFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAGFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; FMAD:       ; %bb.0:
-; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    v_mad_f32 v0, -v0, v1, v1
+; FMAD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMAD-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FMADGFX10-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
 ; FMADGFX10:       ; %bb.0:
-; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    v_mad_f32 v0, -v0, v1, v1
+; FMADGFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FMADGFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub fast float 1.0, %arg0
   %splat = insertelement <2 x float> undef, float %sub, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index ec351e9b69370ee..54c6850ab6bc06e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -551,7 +551,6 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 ; SI-LABEL: no_fmax3_v2f16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -572,11 +571,11 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v2
 ; SI-NEXT:    v_max3_f32 v0, v4, v0, v6
 ; SI-NEXT:    v_max3_f32 v1, v5, v1, v7
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: no_fmax3_v2f16:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -584,23 +583,24 @@ define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; VI-NEXT:    v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: no_fmax3_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: no_fmax3_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index e874ee56f594cae..ec362bf6c16fb4d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -14,61 +14,61 @@
 define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v0, v1, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmax_legacy_ugt_f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NNAN-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt half %a, %b
   %val = select i1 %cmp, half %a, half %b
@@ -78,7 +78,6 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
 define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v3, v2
@@ -87,17 +86,17 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v3, v2
@@ -106,19 +105,19 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -129,11 +128,11 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v0, v2, v0
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v1, v3, v1
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -144,11 +143,11 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NNAN-NEXT:    v_max_f32_e32 v0, v0, v2
 ; SI-NNAN-NEXT:    v_max_f32_e32 v1, v1, v3
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -157,12 +156,13 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt <2 x half> %a, %b
   %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -172,7 +172,6 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v5, v4
@@ -183,18 +182,18 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v5, v4
@@ -205,20 +204,20 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -234,11 +233,11 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v0, v3, v0
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v1, v4, v1
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v2, v5, v2
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -254,11 +253,11 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_max_f32_e32 v0, v0, v3
 ; SI-NNAN-NEXT:    v_max_f32_e32 v1, v1, v4
 ; SI-NNAN-NEXT:    v_max_f32_e32 v2, v2, v5
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v2
@@ -269,13 +268,14 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v3
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt <3 x half> %a, %b
   %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b
@@ -285,7 +285,6 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -301,18 +300,18 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -329,22 +328,22 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -365,11 +364,11 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v1, v5, v1
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v2, v6, v2
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v3, v7, v3
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -390,11 +389,11 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_max_f32_e32 v1, v1, v5
 ; SI-NNAN-NEXT:    v_max_f32_e32 v2, v2, v6
 ; SI-NNAN-NEXT:    v_max_f32_e32 v3, v3, v7
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -411,13 +410,14 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -427,7 +427,6 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -457,20 +456,20 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_perm_b32 v1, v10, v1, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v2, v12, v2, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v3, v14, v3, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v1, v1, v5
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v2, v2, v6
 ; GFX9-NNAN-NEXT:    v_pk_max_f16 v3, v3, v7
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -503,11 +502,11 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -520,11 +519,11 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
 ; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
 ; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -565,11 +564,11 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v5, v13, v5
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v6, v14, v6
 ; SI-SAFE-NEXT:    v_max_legacy_f32_e32 v7, v15, v7
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v14, v14
@@ -610,11 +609,11 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_max_f32_e32 v5, v5, v13
 ; SI-NNAN-NEXT:    v_max_f32_e32 v6, v6, v14
 ; SI-NNAN-NEXT:    v_max_f32_e32 v7, v7, v15
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -645,15 +644,16 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v1, v1, v5
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v2, v2, v6
 ; GFX11-NNAN-NEXT:    v_pk_max_f16 v3, v3, v7
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ugt <8 x half> %a, %b
   %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index e9bf515daabca9f..c9240e355cee242 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -19,48 +19,48 @@ declare half @llvm.fabs.f16(half) #0
 define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v3, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -73,48 +73,48 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_flags:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_flags:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_flags:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_flags:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v3, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_flags:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -127,48 +127,44 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v1, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; GFX7-SDAG-NEXT:    flat_store_dword v[3:4], v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v1, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; GFX7-GISEL-NEXT:    flat_store_dword v[3:4], v1
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_f16_multi_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v1, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; GFX8-NEXT:    flat_store_dword v[3:4], v1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v1, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; GFX9-NEXT:    global_store_dword v[3:4], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -182,46 +178,46 @@ define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, p
 define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, 2.0, v0, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, 2.0, v0, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v2, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, 2.0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_k0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_med3_f32 v0, 2.0, v0, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg1.ext = fpext half %arg1 to float
   %arg2.ext = fpext half %arg2 to float
@@ -233,46 +229,46 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k1:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, 2.0, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k1:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, 2.0, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k1:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v2, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, 2.0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_k1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_med3_f32 v0, v0, 2.0, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg2.ext = fpext half %arg2 to float
@@ -284,46 +280,46 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k2:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, 2.0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k2:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, 2.0
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k2:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k2:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, 2.0, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_k2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, 2.0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -335,49 +331,49 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 {
 define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41800000
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, 0, v1, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41800000
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, 0, v1, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4c00
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v2, 0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v1, 0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4c00
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, 0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, 0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41800000
 ; GFX9-NEXT:    v_med3_f32 v0, 0, v1, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg2.ext = fpext half %arg2 to float
   %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 16.0, float %arg2.ext)
@@ -388,44 +384,44 @@ define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, 0, v0, 2.0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, 0, v0, 2.0
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, 0, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, 0, v0
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, 2.0, v0
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v1, 0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, 0, v0
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, 2.0, v1
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_k0_k2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_med3_f32 v0, 0, v0, 2.0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg1.ext = fpext half %arg1 to float
   %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %arg1.ext, float 2.0)
@@ -436,7 +432,6 @@ define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 {
 define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fabs:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -446,44 +441,45 @@ define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fabs:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fabs:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v3, |v0|, |v1|
 ; GFX8-SDAG-NEXT:    v_min_f16_e64 v0, |v0|, |v1|
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v0, v0, |v2|
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fabs:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e64 v3, |v0|, |v1|
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v0, |v0|, |v1|
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v1, v3, |v2|
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fabs.arg0 = call half @llvm.fabs.f16(half %arg0)
   %fabs.arg1 = call half @llvm.fabs.f16(half %arg1)
@@ -499,40 +495,40 @@ define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_fabs_f32_fpext_f16:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, |v0|, |v1|, |v2|
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_fabs_f32_fpext_f16:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, |v0|, |v1|, |v2|
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_fabs_f32_fpext_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, |v0|, |v1|, |v2|
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_fabs_f32_fpext_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, |v0|, |v1|, |v2|
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -548,48 +544,48 @@ define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v2, -v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v3, -v0, -v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e64 v0, -v0, -v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v0, v0, -v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e64 v3, -v0, -v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v1, v3, -v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fneg.arg0 = fneg half %arg0
   %fneg.arg1 = fneg half %arg1
@@ -605,40 +601,40 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_fneg_f32_fpext_f16:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_fneg_f32_fpext_f16:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_fneg_f32_fpext_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_fneg_f32_fpext_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -654,7 +650,6 @@ define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -664,44 +659,45 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, -v0, -v1, -v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v1, -|v1|
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e64 v2, -|v2|
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v3, -|v0|, -|v1|
 ; GFX8-SDAG-NEXT:    v_min_f16_e64 v0, -|v0|, -|v1|
 ; GFX8-SDAG-NEXT:    v_max_f16_e64 v0, v0, -|v2|
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_min_f16_e64 v3, -|v0|, -|v1|
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v0, -|v0|, -|v1|
 ; GFX8-GISEL-NEXT:    v_max_f16_e64 v1, v3, -|v2|
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -|v1|
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -|v2|
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %fabs.arg0 = call half @llvm.fabs.f16(half %arg0)
   %fabs.arg1 = call half @llvm.fabs.f16(half %arg1)
@@ -720,40 +716,40 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #
 define half @fmed3_fneg_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -775,29 +771,29 @@ define half @fmed3_fneg_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #
 define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -810,61 +806,56 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
 define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v1, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    flat_store_dword v[3:4], v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_mov_b32_e32 v0, v1
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_0:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v2
 ; GFX7-GISEL-NEXT:    flat_store_dword v[3:4], v5
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v5, v0, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX8-SDAG-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_0:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX8-GISEL-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v3, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v2
 ; GFX9-NEXT:    global_store_dword v[3:4], v5, off
 ; GFX9-NEXT:    v_med3_f32 v0, v5, v0, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   store float %arg0.ext, ptr addrspace(1) %ptr
@@ -878,60 +869,55 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2,
 define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    flat_store_dword v[3:4], v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    flat_store_dword v[3:4], v1
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX8-SDAG-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX8-GISEL-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v3, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    global_store_dword v[3:4], v1, off
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -945,60 +931,55 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2,
 define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2, ptr addrspace(1) %ptr) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    flat_store_dword v[3:4], v2
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-GISEL-NEXT:    flat_store_dword v[3:4], v2
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX8-SDAG-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v3, v0, v1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v3, v0
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_multi_use_2:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v5, v2
 ; GFX8-GISEL-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v3, v2
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_multi_use_2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    global_store_dword v[3:4], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -1012,24 +993,24 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2,
 define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
 ; GFX7-LABEL: fmed3_f32_fpext_bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_bf16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext bfloat %arg0 to float
   %arg1.ext = fpext bfloat %arg1 to float
@@ -1042,28 +1023,28 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
 define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1 {
 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext bfloat %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -1076,28 +1057,28 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
 define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1 {
 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext bfloat %arg1 to float
@@ -1110,28 +1091,28 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
 define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1 {
 ; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fmed3_f32_fpext_f16_bf16_2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
@@ -1144,61 +1125,61 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
 define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, s4, v0, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v2, v0, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX8-SDAG-NEXT:    v_med3_f32 v0, s4, v0, v1
 ; GFX8-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-GISEL-NEXT:    v_med3_f32 v0, v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX9-SDAG-NEXT:    v_med3_f32 v0, s4, v0, v1
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-GISEL-NEXT:    v_med3_f32 v0, v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %arg1.ext = fpext half %arg1 to float
   %arg2.ext = fpext half %arg2 to float
@@ -1210,61 +1191,61 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, s4, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v2, v1
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX8-SDAG-NEXT:    v_med3_f32 v0, v0, s4, v1
 ; GFX8-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-GISEL-NEXT:    v_med3_f32 v0, v0, v2, v1
 ; GFX8-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX9-SDAG-NEXT:    v_med3_f32 v0, v0, s4, v1
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-GISEL-NEXT:    v_med3_f32 v0, v0, v2, v1
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg2.ext = fpext half %arg2 to float
@@ -1276,61 +1257,61 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 {
 define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 {
 ; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX7-SDAG-NEXT:    v_med3_f32 v0, v0, v1, s4
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX7-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX8-SDAG-NEXT:    v_med3_f32 v0, v0, v1, s4
 ; GFX8-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX8-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX8-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX9-SDAG-NEXT:    v_med3_f32 v0, v0, v1, s4
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX9-GISEL-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %arg0.ext = fpext half %arg0 to float
   %arg1.ext = fpext half %arg1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 44d8565d304da68..7fe48f2196d6a05 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -551,7 +551,6 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 ; SI-LABEL: no_fmin3_v2f16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -572,11 +571,11 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v2
 ; SI-NEXT:    v_min3_f32 v0, v4, v0, v6
 ; SI-NEXT:    v_min3_f32 v1, v5, v1, v7
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: no_fmin3_v2f16:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_min_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -584,23 +583,24 @@ define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; VI-NEXT:    v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_min_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: no_fmin3_v2f16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: no_fmin3_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_min_f16 v0, v2, v0
 ; GFX11-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 0723290bdf734dd..69c322b84dd56f4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -15,61 +15,61 @@
 define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v0, v1, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmin_legacy_ule_f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NNAN-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select i1 %cmp, half %a, half %b
@@ -79,7 +79,6 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
 define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
@@ -88,17 +87,17 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
@@ -107,19 +106,19 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -130,11 +129,11 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v0, v2, v0
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v1, v3, v1
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -145,11 +144,11 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NNAN-NEXT:    v_min_f32_e32 v0, v0, v2
 ; SI-NNAN-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -158,12 +157,13 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
   %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
@@ -173,7 +173,6 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX9-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
@@ -184,18 +183,18 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v4, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
@@ -206,20 +205,20 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -235,11 +234,11 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v0, v3, v0
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v1, v4, v1
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v2, v5, v2
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -255,11 +254,11 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v0, v0, v3
 ; SI-NNAN-NEXT:    v_min_f32_e32 v1, v1, v4
 ; SI-NNAN-NEXT:    v_min_f32_e32 v2, v2, v5
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
@@ -270,13 +269,14 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <3 x half> %a, %b
   %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b
@@ -286,7 +286,6 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -302,18 +301,18 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
@@ -330,22 +329,22 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -366,11 +365,11 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v1, v5, v1
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v2, v6, v2
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v3, v7, v3
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -391,11 +390,11 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v1, v1, v5
 ; SI-NNAN-NEXT:    v_min_f32_e32 v2, v2, v6
 ; SI-NNAN-NEXT:    v_min_f32_e32 v3, v3, v7
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -412,13 +411,14 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -428,7 +428,6 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -458,20 +457,20 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX9-SAFE-NEXT:    v_perm_b32 v1, v10, v1, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v2, v12, v2, s4
 ; GFX9-SAFE-NEXT:    v_perm_b32 v3, v14, v3, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
 ; GFX9-NNAN:       ; %bb.0:
-; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v1, v1, v5
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v2, v2, v6
 ; GFX9-NNAN-NEXT:    v_pk_min_f16 v3, v3, v7
+; GFX9-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -504,11 +503,11 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v4, 16, v14
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
 ; VI-NNAN:       ; %bb.0:
-; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -521,11 +520,11 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
 ; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
 ; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -566,11 +565,11 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v5, v13, v5
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v6, v14, v6
 ; SI-SAFE-NEXT:    v_min_legacy_f32_e32 v7, v15, v7
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
 ; SI-NNAN:       ; %bb.0:
-; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NNAN-NEXT:    v_cvt_f16_f32_e32 v14, v14
@@ -611,11 +610,11 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v5, v5, v13
 ; SI-NNAN-NEXT:    v_min_f32_e32 v6, v6, v14
 ; SI-NNAN-NEXT:    v_min_f32_e32 v7, v7, v15
+; SI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
@@ -646,15 +645,16 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
 ; GFX11-NNAN:       ; %bb.0:
-; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v5
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v2, v2, v6
 ; GFX11-NNAN-NEXT:    v_pk_min_f16 v3, v3, v7
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <8 x half> %a, %b
   %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 0ff5ea6ee7b1f12..94a50153503207c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -19,8 +19,8 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
 define float @v_mul_42_f32(float %x) {
 ; GCN-LABEL: v_mul_42_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 42.0
   ret float %mul
@@ -29,24 +29,24 @@ define float @v_mul_42_f32(float %x) {
 define double @v_mul_42_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_42_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40450000
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_42_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40450000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_42_f64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_mul_f64 v[0:1], 0x40450000, v[0:1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 42.0
   ret double %mul
@@ -55,8 +55,8 @@ define double @v_mul_42_f64(double %x) {
 define half @v_mul_42_f16(half %x) {
 ; GCN-LABEL: v_mul_42_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 42.0
   ret half %mul
@@ -65,22 +65,22 @@ define half @v_mul_42_f16(half %x) {
 define <2 x half> @v_mul_42_v2f16(<2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_42_v2f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5140
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_42_v2f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x51405140
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_42_v2f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_pk_mul_f16 v0, 0x5140, v0 op_sel_hi:[0,1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x half> %x, <half 42.0, half 42.0>
   ret <2 x half> %mul
@@ -90,8 +90,8 @@ define <2 x half> @v_mul_42_v2f16(<2 x half> %x) {
 define float @v_mul_0x1pn17_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1pn17_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x37000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 0.00000762939453125
   ret float %mul
@@ -101,8 +101,8 @@ define float @v_mul_0x1pn17_f32(float %x) {
 define float @v_mul_0x1pn16_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1pn16_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x37800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 0.0000152587890625
   ret float %mul
@@ -112,8 +112,8 @@ define float @v_mul_0x1pn16_f32(float %x) {
 define float @v_mul_0x1pn15_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1pn15_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 0.000030517578125
   ret float %mul
@@ -122,8 +122,8 @@ define float @v_mul_0x1pn15_f32(float %x) {
 define float @v_mul_neg256_f32(float %x) {
 ; GCN-LABEL: v_mul_neg256_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc3800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -256.0
   ret float %mul
@@ -132,8 +132,8 @@ define float @v_mul_neg256_f32(float %x) {
 define float @v_mul_neg128_f32(float %x) {
 ; GCN-LABEL: v_mul_neg128_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc3000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -128.0
   ret float %mul
@@ -142,8 +142,8 @@ define float @v_mul_neg128_f32(float %x) {
 define float @v_mul_neg64_f32(float %x) {
 ; GCN-LABEL: v_mul_neg64_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc2800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -64.0
   ret float %mul
@@ -152,8 +152,8 @@ define float @v_mul_neg64_f32(float %x) {
 define float @v_mul_neg32_f32(float %x) {
 ; GCN-LABEL: v_mul_neg32_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc2000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -32.0
   ret float %mul
@@ -162,8 +162,8 @@ define float @v_mul_neg32_f32(float %x) {
 define float @v_mul_neg16_f32(float %x) {
 ; GCN-LABEL: v_mul_neg16_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc1800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -16.0
   ret float %mul
@@ -172,8 +172,8 @@ define float @v_mul_neg16_f32(float %x) {
 define float @v_mul_neg8_f32(float %x) {
 ; GCN-LABEL: v_mul_neg8_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xc1000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -8.0
   ret float %mul
@@ -182,8 +182,8 @@ define float @v_mul_neg8_f32(float %x) {
 define float @v_mul_neg4_f32(float %x) {
 ; GCN-LABEL: v_mul_neg4_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -4.0
   ret float %mul
@@ -192,8 +192,8 @@ define float @v_mul_neg4_f32(float %x) {
 define float @v_mul_neg2_f32(float %x) {
 ; GCN-LABEL: v_mul_neg2_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -2.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -2.0
   ret float %mul
@@ -202,38 +202,38 @@ define float @v_mul_neg2_f32(float %x) {
 define float @v_mul_neg1_f32(float %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg1_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg1_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg1_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg1_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg1_f32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg1_f32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -1.0
   ret float %mul
@@ -242,8 +242,8 @@ define float @v_mul_neg1_f32(float %x) {
 define float @v_mul_neg_half_f32(float %x) {
 ; GCN-LABEL: v_mul_neg_half_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -0.5, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -0.5
   ret float %mul
@@ -252,8 +252,8 @@ define float @v_mul_neg_half_f32(float %x) {
 define float @v_mul_neg_quarter_f32(float %x) {
 ; GCN-LABEL: v_mul_neg_quarter_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xbe800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -0.25
   ret float %mul
@@ -262,8 +262,8 @@ define float @v_mul_neg_quarter_f32(float %x) {
 define float @v_mul_quarter_f32(float %x) {
 ; GCN-LABEL: v_mul_quarter_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3e800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 0.25
   ret float %mul
@@ -272,8 +272,8 @@ define float @v_mul_quarter_f32(float %x) {
 define float @v_mul_half_f32(float %x) {
 ; GCN-LABEL: v_mul_half_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0.5, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 0.5
   ret float %mul
@@ -291,38 +291,38 @@ define float @v_mul_1_f32(float %x) {
 define float @v_mul_2_f32(float %x) {
 ; GFX9-SDAG-LABEL: v_mul_2_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_2_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, 2.0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_2_f32:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_2_f32:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, 2.0, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_2_f32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_2_f32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, 2.0, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 2.0
   ret float %mul
@@ -331,8 +331,8 @@ define float @v_mul_2_f32(float %x) {
 define float @v_mul_4_f32(float %x) {
 ; GCN-LABEL: v_mul_4_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 4.0
   ret float %mul
@@ -341,8 +341,8 @@ define float @v_mul_4_f32(float %x) {
 define float @v_mul_8_f32(float %x) {
 ; GCN-LABEL: v_mul_8_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x41000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 8.0
   ret float %mul
@@ -351,8 +351,8 @@ define float @v_mul_8_f32(float %x) {
 define float @v_mul_16_f32(float %x) {
 ; GCN-LABEL: v_mul_16_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x41800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 16.0
   ret float %mul
@@ -361,8 +361,8 @@ define float @v_mul_16_f32(float %x) {
 define float @v_mul_32_f32(float %x) {
 ; GCN-LABEL: v_mul_32_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x42000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 32.0
   ret float %mul
@@ -371,8 +371,8 @@ define float @v_mul_32_f32(float %x) {
 define float @v_mul_64_f32(float %x) {
 ; GCN-LABEL: v_mul_64_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x42800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 64.0
   ret float %mul
@@ -381,8 +381,8 @@ define float @v_mul_64_f32(float %x) {
 define float @v_mul_128_f32(float %x) {
 ; GCN-LABEL: v_mul_128_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x43000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 128.0
   ret float %mul
@@ -391,8 +391,8 @@ define float @v_mul_128_f32(float %x) {
 define float @v_mul_256_f32(float %x) {
 ; GCN-LABEL: v_mul_256_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x43800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 256.0
   ret float %mul
@@ -402,8 +402,8 @@ define float @v_mul_256_f32(float %x) {
 define float @v_mul_0x1p63_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1p63_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 9223372036854775808.0
   ret float %mul
@@ -413,8 +413,8 @@ define float @v_mul_0x1p63_f32(float %x) {
 define float @v_mul_0x1p64_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1p64_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f800000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 18446744073709551616.0
   ret float %mul
@@ -424,8 +424,8 @@ define float @v_mul_0x1p64_f32(float %x) {
 define float @v_mul_0x1p65_f32(float %x) {
 ; GCN-LABEL: v_mul_0x1p65_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x60000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, 36893488147419103232.0
   ret float %mul
@@ -435,22 +435,22 @@ define float @v_mul_0x1p65_f32(float %x) {
 define float @v_fma_mul_add_32_f32(float %x, float %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_32_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_32_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_32_f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fmamk_f32 v0, v0, 0x42000000, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %x, 32.0
   %fma = fadd contract float %mul, %y
@@ -460,22 +460,22 @@ define float @v_fma_mul_add_32_f32(float %x, float %y) {
 define float @v_fma_mul_sub_32_f32(float %x, float %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_sub_32_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, v0, s4, -v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_sub_32_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, v0, v2, -v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_sub_32_f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f32 v0, 0x42000000, v0, -v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %x, 32.0
   %fma = fsub contract float %mul, %y
@@ -485,22 +485,22 @@ define float @v_fma_mul_sub_32_f32(float %x, float %y) {
 define float @v_fma_mul_add_neg32_f32(float %x, float %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_neg32_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xc2000000
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_neg32_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2000000
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_neg32_f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fmamk_f32 v0, v0, 0xc2000000, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract float %x, -32.0
   %fma = fadd contract float %mul, %y
@@ -510,22 +510,22 @@ define float @v_fma_mul_add_neg32_f32(float %x, float %y) {
 define float @v_mul_fabs_32_f32(float %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_32_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
 ; GFX9-SDAG-NEXT:    v_mul_f32_e64 v0, |v0|, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_32_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX9-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_fabs_32_f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_mul_f32_e64 v0, 0x42000000, |v0|
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %mul = fmul float %x.fabs, 32.0
@@ -535,22 +535,22 @@ define float @v_mul_fabs_32_f32(float %x) {
 define float @v_mul_add_fma_fabs_32_f32(float %x, float %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_fma_fabs_32_f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, |v0|, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_fma_fabs_32_f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, |v0|, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_add_fma_fabs_32_f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f32 v0, 0x42000000, |v0|, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %mul = fmul contract float %x.fabs, 32.0
@@ -561,22 +561,22 @@ define float @v_mul_add_fma_fabs_32_f32(float %x, float %y) {
 define <2 x float> @v_mul_16_v2f32(<2 x float> %x) {
 ; GFX9-LABEL: v_mul_16_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x41800000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x41800000, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_16_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41800000, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_16_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41800000, v0 :: v_dual_mul_f32 v1, 0x41800000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x float> %x, <float 16.0, float 16.0>
   ret <2 x float> %mul
@@ -585,22 +585,22 @@ define <2 x float> @v_mul_16_v2f32(<2 x float> %x) {
 define <2 x float> @v_mul_neg16_v2f32(<2 x float> %x) {
 ; GFX9-LABEL: v_mul_neg16_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0xc1800000, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0xc1800000, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_neg16_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0xc1800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0xc1800000, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mul_neg16_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, 0xc1800000, v0 :: v_dual_mul_f32 v1, 0xc1800000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x float> %x, <float -16.0, float -16.0>
   ret <2 x float> %mul
@@ -609,25 +609,25 @@ define <2 x float> @v_mul_neg16_v2f32(<2 x float> %x) {
 define <2 x float> @v_mul_fabs_16_v2f32(<2 x float> %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_16_v2f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x41800000
 ; GFX9-SDAG-NEXT:    v_mul_f32_e64 v0, |v0|, s4
 ; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v1|, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_16_v2f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX9-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v2
 ; GFX9-GISEL-NEXT:    v_mul_f32_e64 v1, |v1|, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_fabs_16_v2f32:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_mul_f32_e64 v0, 0x41800000, |v0|
 ; GFX1011-NEXT:    v_mul_f32_e64 v1, 0x41800000, |v1|
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
   %mul = fmul <2 x float> %x.fabs, <float 16.0, float 16.0>
@@ -637,31 +637,31 @@ define <2 x float> @v_mul_fabs_16_v2f32(<2 x float> %x) {
 define <2 x float> @v_fma_mul_add_32_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_32_v2f32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
 ; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_32_v2f32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, v0, v4, v2
 ; GFX9-GISEL-NEXT:    v_fma_f32 v1, v1, v4, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_mul_add_32_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmamk_f32 v0, v0, 0x42000000, v2
 ; GFX10-NEXT:    v_fmamk_f32 v1, v1, 0x42000000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_mul_add_32_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_fmamk_f32 v0, v0, 0x42000000, v2 :: v_dual_fmamk_f32 v1, v1, 0x42000000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x float> %x, <float 32.0, float 32.0>
   %fma = fadd contract <2 x float> %mul, %y
@@ -753,41 +753,41 @@ define amdgpu_ps i32 @s_mul_fma_32_f32(float inreg %x, float inreg %y) {
 define double @v_mul_0x1pn1031_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn1031_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xfbf9
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1031_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffbf9
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1031_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffbf9
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1031_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.34584737989687770135e-311
   ret double %mul
@@ -797,41 +797,41 @@ define double @v_mul_0x1pn1031_f64(double %x) {
 define double @v_mul_0x1pn1022_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn1022_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xfc02
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x100000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1022_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffc02
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x100000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1022_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffc02
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1022_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x100000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 2.22507385850720138309e-308
   ret double %mul
@@ -841,41 +841,41 @@ define double @v_mul_0x1pn1022_f64(double %x) {
 define double @v_mul_0x1pn1021_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn1021_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xfc03
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x200000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn1021_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffc03
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x200000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn1021_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xfffffc03
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn1021_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x200000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.45014771701440276618e-308
   ret double %mul
@@ -885,41 +885,41 @@ define double @v_mul_0x1pn1021_f64(double %x) {
 define double @v_mul_0x1pn64_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn64_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffc0
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3bf00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn64_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xffffffc0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3bf00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn64_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xffffffc0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn64_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3bf00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 5.42101086242752217004e-20
   ret double %mul
@@ -929,41 +929,41 @@ define double @v_mul_0x1pn64_f64(double %x) {
 define double @v_mul_0x1pn17_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn17_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xffef
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3ee00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn17_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xffffffef
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ee00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn17_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0xffffffef
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn17_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ee00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.00000762939453125
   ret double %mul
@@ -973,40 +973,40 @@ define double @v_mul_0x1pn17_f64(double %x) {
 define double @v_mul_0x1pn16_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn16_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -16
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3ef00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn16_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ef00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn16_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -16
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn16_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3ef00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.0000152587890625
   ret double %mul
@@ -1016,40 +1016,40 @@ define double @v_mul_0x1pn16_f64(double %x) {
 define double @v_mul_0x1pn15_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1pn15_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -15
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0.5
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1pn15_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -15
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3f000000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1pn15_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -15
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1pn15_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3f000000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.000030517578125
   ret double %mul
@@ -1058,40 +1058,40 @@ define double @v_mul_0x1pn15_f64(double %x) {
 define double @v_mul_neg256_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg256_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0700000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg256_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 8
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg256_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 8
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg256_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -256.0
   ret double %mul
@@ -1100,40 +1100,40 @@ define double @v_mul_neg256_f64(double %x) {
 define double @v_mul_neg128_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg128_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0600000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg128_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 7
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0600000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg128_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 7
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg128_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0600000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -128.0
   ret double %mul
@@ -1142,40 +1142,40 @@ define double @v_mul_neg128_f64(double %x) {
 define double @v_mul_neg64_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg64_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg64_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 6
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0500000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg64_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 6
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg64_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0500000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -64.0
   ret double %mul
@@ -1184,40 +1184,40 @@ define double @v_mul_neg64_f64(double %x) {
 define double @v_mul_neg32_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg32_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 5
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0400000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg32_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 5
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0400000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -32.0
   ret double %mul
@@ -1226,40 +1226,40 @@ define double @v_mul_neg32_f64(double %x) {
 define double @v_mul_neg16_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg16_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0300000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg16_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg16_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg16_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -16.0
   ret double %mul
@@ -1268,40 +1268,40 @@ define double @v_mul_neg16_f64(double %x) {
 define double @v_mul_neg8_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg8_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0200000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg8_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg8_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg8_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -8.0
   ret double %mul
@@ -1310,8 +1310,8 @@ define double @v_mul_neg8_f64(double %x) {
 define double @v_mul_neg4_f64(double %x) {
 ; GCN-LABEL: v_mul_neg4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], -4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -4.0
   ret double %mul
@@ -1320,8 +1320,8 @@ define double @v_mul_neg4_f64(double %x) {
 define double @v_mul_neg2_f64(double %x) {
 ; GCN-LABEL: v_mul_neg2_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], -2.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -2.0
   ret double %mul
@@ -1330,8 +1330,8 @@ define double @v_mul_neg2_f64(double %x) {
 define double @v_mul_0_f64(double %x) {
 ; GCN-LABEL: v_mul_0_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.0
   ret double %mul
@@ -1340,38 +1340,38 @@ define double @v_mul_0_f64(double %x) {
 define double @v_mul_neg1_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg1_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg1_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], -1.0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg1_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg1_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], -1.0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg1_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg1_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], -1.0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -1.0
   ret double %mul
@@ -1380,8 +1380,8 @@ define double @v_mul_neg1_f64(double %x) {
 define double @v_mul_neg_half_f64(double %x) {
 ; GCN-LABEL: v_mul_neg_half_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], -0.5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -0.5
   ret double %mul
@@ -1390,40 +1390,40 @@ define double @v_mul_neg_half_f64(double %x) {
 define double @v_mul_neg_quarter_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg_quarter_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], -2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbfd00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg_quarter_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], -2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg_quarter_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], -2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, -0.25
   ret double %mul
@@ -1432,40 +1432,40 @@ define double @v_mul_neg_quarter_f64(double %x) {
 define double @v_mul_quarter_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_quarter_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fd00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_quarter_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_quarter_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], -2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.25
   ret double %mul
@@ -1474,8 +1474,8 @@ define double @v_mul_quarter_f64(double %x) {
 define double @v_mul_half_f64(double %x) {
 ; GCN-LABEL: v_mul_half_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], 0.5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 0.5
   ret double %mul
@@ -1493,38 +1493,38 @@ define double @v_mul_1_f64(double %x) {
 define double @v_mul_2_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_2_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_2_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_2_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_2_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_2_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_2_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 2.0
   ret double %mul
@@ -1533,8 +1533,8 @@ define double @v_mul_2_f64(double %x) {
 define double @v_mul_4_f64(double %x) {
 ; GCN-LABEL: v_mul_4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.0
   ret double %mul
@@ -1543,40 +1543,40 @@ define double @v_mul_4_f64(double %x) {
 define double @v_mul_8_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_8_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_8_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40200000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_8_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_8_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_8_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_8_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 8.0
   ret double %mul
@@ -1585,40 +1585,40 @@ define double @v_mul_8_f64(double %x) {
 define double @v_mul_16_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_16_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_16_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40300000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_16_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_16_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_16_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_16_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 16.0
   ret double %mul
@@ -1627,40 +1627,40 @@ define double @v_mul_16_f64(double %x) {
 define double @v_mul_32_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_32_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_32_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 32.0
   ret double %mul
@@ -1669,40 +1669,40 @@ define double @v_mul_32_f64(double %x) {
 define double @v_mul_64_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_64_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 6
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_64_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40500000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_64_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 6
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_64_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40500000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_64_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 6
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_64_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40500000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 64.0
   ret double %mul
@@ -1711,40 +1711,40 @@ define double @v_mul_64_f64(double %x) {
 define double @v_mul_128_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_128_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 7
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_128_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40600000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_128_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 7
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_128_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40600000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_128_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 7
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_128_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40600000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 128.0
   ret double %mul
@@ -1753,40 +1753,40 @@ define double @v_mul_128_f64(double %x) {
 define double @v_mul_256_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_256_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_256_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40700000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_256_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 8
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_256_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40700000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_256_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 8
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_256_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40700000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 256.0
   ret double %mul
@@ -1796,40 +1796,40 @@ define double @v_mul_256_f64(double %x) {
 define double @v_mul_0x1p63_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p63_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 63
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43e00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p63_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 63
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43e00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p63_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 63
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p63_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43e00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 9223372036854775808.0
   ret double %mul
@@ -1839,40 +1839,40 @@ define double @v_mul_0x1p63_f64(double %x) {
 define double @v_mul_0x1p64_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p64_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43f00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p64_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43f00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p64_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p64_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x43f00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 18446744073709551616.0
   ret double %mul
@@ -1882,41 +1882,41 @@ define double @v_mul_0x1p64_f64(double %x) {
 define double @v_mul_0x1p65_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p65_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x41
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_bfrev_b32_e32 v3, 34
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p65_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x41
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p65_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x41
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p65_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x44000000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 36893488147419103232.0
   ret double %mul
@@ -1982,41 +1982,41 @@ define amdgpu_ps <2 x i32> @s_mul_0x1p65_f64(double inreg %x, double inreg %y) {
 define double @v_mul_0x1p128_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p128_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x80
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x47f00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p128_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x80
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x47f00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p128_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x80
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p128_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x47f00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 3.40282366920938463463e+38
   ret double %mul
@@ -2026,41 +2026,41 @@ define double @v_mul_0x1p128_f64(double %x) {
 define double @v_mul_0x1p1022_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p1022_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x3fe
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fd00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p1022_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x3fe
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fd00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p1022_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x3fe
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p1022_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fd00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 4.49423283715578976932e+307
   ret double %mul
@@ -2070,41 +2070,41 @@ define double @v_mul_0x1p1022_f64(double %x) {
 define double @v_mul_0x1p1023_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_0x1p1023_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x3ff
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fe00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_0x1p1023_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x3ff
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fe00000, v[0:1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_0x1p1023_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 0x3ff
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_0x1p1023_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x7fe00000, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 8.98846567431157953865e+307
   ret double %mul
@@ -2114,24 +2114,24 @@ define double @v_mul_0x1p1023_f64(double %x) {
 define double @v_fma_mul_add_32_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_32_f64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[2:3]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 32.0
   %fma = fadd contract double %mul, %y
@@ -2141,39 +2141,39 @@ define double @v_fma_mul_add_32_f64(double %x, double %y) {
 define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX9-LABEL: v_fma_mul_add_32_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x40400000
 ; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[4:5]
 ; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x double> %x, <double 32.0, double 32.0>
   %fma = fadd contract <2 x double> %mul, %y
@@ -2183,9 +2183,9 @@ define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) {
 define <2 x double> @v_fma_mul_add_2_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GCN-LABEL: v_fma_mul_add_2_v2f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], 2.0, v[4:5]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], 2.0, v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x double> %x, <double 2.0, double 2.0>
   %fma = fadd contract <2 x double> %mul, %y
@@ -2195,9 +2195,9 @@ define <2 x double> @v_fma_mul_add_2_v2f64(<2 x double> %x, <2 x double> %y) {
 define <2 x double> @v_fma_mul_add_4_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GCN-LABEL: v_fma_mul_add_4_v2f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], 4.0, v[4:5]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], 4.0, v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x double> %x, <double 4.0, double 4.0>
   %fma = fadd contract <2 x double> %mul, %y
@@ -2207,8 +2207,8 @@ define <2 x double> @v_fma_mul_add_4_v2f64(<2 x double> %x, <2 x double> %y) {
 define double @v_fma_mul_add_2_f64(double %x, double %y) {
 ; GCN-LABEL: v_fma_mul_add_2_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], 2.0, v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 2.0
   %fma = fadd contract double %mul, %y
@@ -2218,8 +2218,8 @@ define double @v_fma_mul_add_2_f64(double %x, double %y) {
 define double @v_fma_mul_add_4_f64(double %x, double %y) {
 ; GCN-LABEL: v_fma_mul_add_4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], 4.0, v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 4.0
   %fma = fadd contract double %mul, %y
@@ -2229,8 +2229,8 @@ define double @v_fma_mul_add_4_f64(double %x, double %y) {
 define double @v_fma_mul_add_neg4_f64(double %x, double %y) {
 ; GCN-LABEL: v_fma_mul_add_neg4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], -4.0, v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, -4.0
   %fma = fadd contract double %mul, %y
@@ -2240,46 +2240,46 @@ define double @v_fma_mul_add_neg4_f64(double %x, double %y) {
 define double @v_mul_add_32_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_add_32_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_add_32_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_add_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, v[0:1]
 ; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 32.0
   %fma = fadd double %mul, %y
@@ -2289,44 +2289,44 @@ define double @v_mul_add_32_f64(double %x, double %y) {
 define double @v_mul_add_2_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_2_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_2_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_add_2_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_add_2_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_add_2_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_add_2_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 2.0
   %fma = fadd double %mul, %y
@@ -2336,44 +2336,44 @@ define double @v_mul_add_2_f64(double %x, double %y) {
 define double @v_mul_add_4_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_4_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_4_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX9-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_add_4_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_add_4_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX10-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_add_4_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[0:1]
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_add_4_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 2.0
 ; GFX11-GISEL-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul double %x, 2.0
   %fma = fadd double %mul, %y
@@ -2383,24 +2383,24 @@ define double @v_mul_add_4_f64(double %x, double %y) {
 define double @v_fma_mul_sub_32_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_sub_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], -v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_sub_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], -v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_sub_32_f64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], -v[2:3]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, 32.0
   %fma = fsub contract double %mul, %y
@@ -2410,24 +2410,24 @@ define double @v_fma_mul_sub_32_f64(double %x, double %y) {
 define double @v_fma_mul_add_neg32_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_neg32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0xc0400000
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_neg32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0400000
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_neg32_f64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f64 v[0:1], 0xc0400000, v[0:1], v[2:3]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract double %x, -32.0
   %fma = fadd contract double %mul, %y
@@ -2437,40 +2437,40 @@ define double @v_fma_mul_add_neg32_f64(double %x, double %y) {
 define double @v_mul_fabs_32_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_32_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 5
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_32_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 5
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40400000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %x.fabs, 32.0
@@ -2480,24 +2480,24 @@ define double @v_mul_fabs_32_f64(double %x) {
 define double @v_mul_add_fma_fabs_32_f64(double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_fma_fabs_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, s[4:5], v[2:3]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_fma_fabs_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40400000
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, v[4:5], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_add_fma_fabs_32_f64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, |v[0:1]|, v[2:3]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %mul = fmul contract double %x.fabs, 32.0
@@ -2508,46 +2508,46 @@ define double @v_mul_add_fma_fabs_32_f64(double %x, double %y) {
 define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 ; GFX9-SDAG-LABEL: v_mul_16_v2f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_16_v2f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_16_v2f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 4
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 4
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double 16.0, double 16.0>
   ret <2 x double> %mul
@@ -2556,46 +2556,46 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg16_v2f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], 4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0xc0300000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg16_v2f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], 4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg16_v2f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], 4
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], 4
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double -16.0, double -16.0>
   ret <2 x double> %mul
@@ -2604,46 +2604,46 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_16_v2f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 4
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[2:3], |v[2:3]|, 4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0x40300000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, s[4:5]
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, s[4:5]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_16_v2f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 4
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[2:3], |v[2:3]|, 4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 4
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[2:3], |v[2:3]|, 4
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   %mul = fmul <2 x double> %x.fabs, <double 16.0, double 16.0>
@@ -2709,8 +2709,8 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) {
 define half @v_mul_0x1pn23_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1pn23_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 2, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0xH0002
   ret half %mul
@@ -2720,8 +2720,8 @@ define half @v_mul_0x1pn23_f16(half %x) {
 define half @v_mul_0x1pn17_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1pn17_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x80, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.00000762939453125
   ret half %mul
@@ -2731,8 +2731,8 @@ define half @v_mul_0x1pn17_f16(half %x) {
 define half @v_mul_0x1pn16_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1pn16_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x100, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.0000152587890625
   ret half %mul
@@ -2742,8 +2742,8 @@ define half @v_mul_0x1pn16_f16(half %x) {
 define half @v_mul_0x1pn15_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1pn15_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x200, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.000030517578125
   ret half %mul
@@ -2752,8 +2752,8 @@ define half @v_mul_0x1pn15_f16(half %x) {
 define half @v_mul_neg256_f16(half %x) {
 ; GCN-LABEL: v_mul_neg256_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -256.0
   ret half %mul
@@ -2762,8 +2762,8 @@ define half @v_mul_neg256_f16(half %x) {
 define half @v_mul_neg128_f16(half %x) {
 ; GCN-LABEL: v_mul_neg128_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -128.0
   ret half %mul
@@ -2772,8 +2772,8 @@ define half @v_mul_neg128_f16(half %x) {
 define half @v_mul_neg64_f16(half %x) {
 ; GCN-LABEL: v_mul_neg64_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -64.0
   ret half %mul
@@ -2782,8 +2782,8 @@ define half @v_mul_neg64_f16(half %x) {
 define half @v_mul_neg32_f16(half %x) {
 ; GCN-LABEL: v_mul_neg32_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -32.0
   ret half %mul
@@ -2792,8 +2792,8 @@ define half @v_mul_neg32_f16(half %x) {
 define half @v_mul_neg16_f16(half %x) {
 ; GCN-LABEL: v_mul_neg16_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -16.0
   ret half %mul
@@ -2802,8 +2802,8 @@ define half @v_mul_neg16_f16(half %x) {
 define half @v_mul_neg8_f16(half %x) {
 ; GCN-LABEL: v_mul_neg8_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -8.0
   ret half %mul
@@ -2812,8 +2812,8 @@ define half @v_mul_neg8_f16(half %x) {
 define half @v_mul_neg4_f16(half %x) {
 ; GCN-LABEL: v_mul_neg4_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -4.0
   ret half %mul
@@ -2822,8 +2822,8 @@ define half @v_mul_neg4_f16(half %x) {
 define half @v_mul_neg2_f16(half %x) {
 ; GCN-LABEL: v_mul_neg2_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, -2.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -2.0
   ret half %mul
@@ -2832,38 +2832,38 @@ define half @v_mul_neg2_f16(half %x) {
 define half @v_mul_neg1_f16(half %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg1_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg1_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, -1.0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_neg1_f16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_neg1_f16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, -1.0, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg1_f16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_neg1_f16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, -1.0, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -1.0
   ret half %mul
@@ -2872,8 +2872,8 @@ define half @v_mul_neg1_f16(half %x) {
 define half @v_mul_neg_half_f16(half %x) {
 ; GCN-LABEL: v_mul_neg_half_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, -0.5, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -0.5
   ret half %mul
@@ -2882,8 +2882,8 @@ define half @v_mul_neg_half_f16(half %x) {
 define half @v_mul_neg_quarter_f16(half %x) {
 ; GCN-LABEL: v_mul_neg_quarter_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -0.25
   ret half %mul
@@ -2892,8 +2892,8 @@ define half @v_mul_neg_quarter_f16(half %x) {
 define half @v_mul_quarter_f16(half %x) {
 ; GCN-LABEL: v_mul_quarter_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.25
   ret half %mul
@@ -2902,8 +2902,8 @@ define half @v_mul_quarter_f16(half %x) {
 define half @v_mul_half_f16(half %x) {
 ; GCN-LABEL: v_mul_half_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.5
   ret half %mul
@@ -2921,38 +2921,38 @@ define half @v_mul_1_f16(half %x) {
 define half @v_mul_2_f16(half %x) {
 ; GFX9-SDAG-LABEL: v_mul_2_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_2_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, 2.0, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_2_f16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_2_f16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, 2.0, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_2_f16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_2_f16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, 2.0, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 2.0
   ret half %mul
@@ -2961,8 +2961,8 @@ define half @v_mul_2_f16(half %x) {
 define half @v_mul_4_f16(half %x) {
 ; GCN-LABEL: v_mul_4_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 4.0
   ret half %mul
@@ -2971,8 +2971,8 @@ define half @v_mul_4_f16(half %x) {
 define half @v_mul_8_f16(half %x) {
 ; GCN-LABEL: v_mul_8_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 8.0
   ret half %mul
@@ -2981,8 +2981,8 @@ define half @v_mul_8_f16(half %x) {
 define half @v_mul_16_f16(half %x) {
 ; GCN-LABEL: v_mul_16_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 16.0
   ret half %mul
@@ -2991,8 +2991,8 @@ define half @v_mul_16_f16(half %x) {
 define half @v_mul_32_f16(half %x) {
 ; GCN-LABEL: v_mul_32_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 32.0
   ret half %mul
@@ -3001,8 +3001,8 @@ define half @v_mul_32_f16(half %x) {
 define half @v_mul_64_f16(half %x) {
 ; GCN-LABEL: v_mul_64_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 64.0
   ret half %mul
@@ -3011,8 +3011,8 @@ define half @v_mul_64_f16(half %x) {
 define half @v_mul_128_f16(half %x) {
 ; GCN-LABEL: v_mul_128_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 128.0
   ret half %mul
@@ -3021,8 +3021,8 @@ define half @v_mul_128_f16(half %x) {
 define half @v_mul_256_f16(half %x) {
 ; GCN-LABEL: v_mul_256_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 256.0
   ret half %mul
@@ -3032,8 +3032,8 @@ define half @v_mul_256_f16(half %x) {
 define half @v_mul_0x1p15_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1p15_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 32768.0
   ret half %mul
@@ -3043,8 +3043,8 @@ define half @v_mul_0x1p15_f16(half %x) {
 define half @v_mul_0x1p14_f16(half %x) {
 ; GCN-LABEL: v_mul_0x1p14_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 16384.0
   ret half %mul
@@ -3054,22 +3054,22 @@ define half @v_mul_0x1p14_f16(half %x) {
 define half @v_fma_mul_add_32_f16(half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_32_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5000
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v0, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_32_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5000
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_32_f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fmamk_f16 v0, v0, 0x5000, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, 32.0
   %fma = fadd contract half %mul, %y
@@ -3079,22 +3079,22 @@ define half @v_fma_mul_add_32_f16(half %x, half %y) {
 define half @v_fma_mul_sub_32_f16(half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_sub_32_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5000
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v0, s4, -v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_sub_32_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5000
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, -v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_sub_32_f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f16 v0, v0, 0x5000, -v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, 32.0
   %fma = fsub contract half %mul, %y
@@ -3104,22 +3104,22 @@ define half @v_fma_mul_sub_32_f16(half %x, half %y) {
 define half @v_fma_mul_add_neg32_f16(half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_neg32_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xd000
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v0, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_neg32_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd000
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_neg32_f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fmamk_f16 v0, v0, 0xd000, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, -32.0
   %fma = fadd contract half %mul, %y
@@ -3129,22 +3129,22 @@ define half @v_fma_mul_add_neg32_f16(half %x, half %y) {
 define half @v_mul_fabs_32_f16(half %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_32_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5000
 ; GFX9-SDAG-NEXT:    v_mul_f16_e64 v0, |v0|, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_32_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x5000
 ; GFX9-GISEL-NEXT:    v_mul_f16_e64 v0, |v0|, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_fabs_32_f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_mul_f16_e64 v0, 0x5000, |v0|
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call half @llvm.fabs.f16(half %x)
   %mul = fmul half %x.fabs, 32.0
@@ -3154,22 +3154,22 @@ define half @v_mul_fabs_32_f16(half %x) {
 define half @v_mul_add_fma_fabs_32_f16(half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_mul_add_fma_fabs_32_f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5000
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, |v0|, s4, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_add_fma_fabs_32_f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5000
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, |v0|, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_add_fma_fabs_32_f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_fma_f16 v0, |v0|, 0x5000, v1
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call half @llvm.fabs.f16(half %x)
   %mul = fmul contract half %x.fabs, 32.0
@@ -3180,22 +3180,22 @@ define half @v_mul_add_fma_fabs_32_f16(half %x, half %y) {
 define <2 x half> @v_mul_16_v2f16(<2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_16_v2f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x4c00
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_16_v2f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4c004c00
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_16_v2f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x half> %x, <half 16.0, half 16.0>
   ret <2 x half> %mul
@@ -3204,22 +3204,22 @@ define <2 x half> @v_mul_16_v2f16(<2 x half> %x) {
 define <2 x half> @v_mul_neg16_v2f16(<2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_neg16_v2f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0xcc00
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_neg16_v2f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0xcc00cc00
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_neg16_v2f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_pk_mul_f16 v0, 0xcc00, v0 op_sel_hi:[0,1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x half> %x, <half -16.0, half -16.0>
   ret <2 x half> %mul
@@ -3228,25 +3228,25 @@ define <2 x half> @v_mul_neg16_v2f16(<2 x half> %x) {
 define <2 x half> @v_mul_fabs_16_v2f16(<2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_16_v2f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x4c00
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_16_v2f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4c004c00
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_fabs_16_v2f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX1011-NEXT:    v_pk_mul_f16 v0, 0x4c00, v0 op_sel_hi:[0,1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
   %mul = fmul <2 x half> %x.fabs, <half 16.0, half 16.0>
@@ -3256,22 +3256,22 @@ define <2 x half> @v_mul_fabs_16_v2f16(<2 x half> %x) {
 define <2 x half> @v_fma_mul_add_32_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_fma_mul_add_32_v2f16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x5000
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v0, s4, v1 op_sel_hi:[1,0,1]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_fma_mul_add_32_v2f16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x50005000
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v0, v2, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_fma_mul_add_32_v2f16:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_pk_fma_f16 v0, 0x5000, v0, v1 op_sel_hi:[0,1,1]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x half> %x, <half 32.0, half 32.0>
   %fma = fadd contract <2 x half> %mul, %y
@@ -3306,19 +3306,19 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
 define float @v_mul_f32_select_64_1(i32 %arg, float %x) {
 ; GFX9-LABEL: v_mul_f32_select_64_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_f32_select_64_1:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
 ; GFX1011-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 64.0, float 1.0
@@ -3329,19 +3329,19 @@ define float @v_mul_f32_select_64_1(i32 %arg, float %x) {
 define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
 ; GFX9-LABEL: v_mul_f32_select_1_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 1.0, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_f32_select_1_64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
 ; GFX1011-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 1.0, float 64.0
@@ -3352,19 +3352,19 @@ define float @v_mul_f32_select_1_64(i32 %arg, float %x) {
 define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
 ; GFX9-LABEL: v_mul_f32_select_n1_n64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -1.0, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_f32_select_n1_n64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
 ; GFX1011-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float -1.0, float -64.0
@@ -3375,19 +3375,19 @@ define float @v_mul_f32_select_n1_n64(i32 %arg, float %x) {
 define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
 ; GFX9-LABEL: v_mul_f32_select_n64_n1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_f32_select_n64_n1:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
 ; GFX1011-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float -64.0, float -1.0
@@ -3398,58 +3398,58 @@ define float @v_mul_f32_select_n64_n1(i32 %arg, float %x) {
 define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
 ; GFX9-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x43000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43000000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x42800000, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f32_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0x43000000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x42800000, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x43000000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 64.0
@@ -3460,58 +3460,58 @@ define float @v_mul_f32_select_128_64(i32 %arg, float %x) {
 define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
 ; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc3000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xc2800000, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xc2800000, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float -128.0, float -64.0
@@ -3522,58 +3522,58 @@ define float @v_mul_f32_select_n128_n64(i32 %arg, float %x) {
 define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
 ; GFX9-SDAG-LABEL: v_mul_f32_select_n128_n16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc3000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f32_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f32_select_n128_n16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xc1800000, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f32_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f32_select_n128_n16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0xc3000000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xc1800000, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f32_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc1800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xc3000000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float -128.0, float -16.0
@@ -3584,19 +3584,19 @@ define float @v_mul_f32_select_n128_n16(i32 %arg, float %x) {
 define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_64_1:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0x42800000, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float 64.0, float 1.0
@@ -3608,19 +3608,19 @@ define float @v_contract_mul_add_f32_select_64_1(i32 %arg, float %x, float %y) {
 define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 1.0, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_1_64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x42800000, 1.0, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float 1.0, float 64.0
@@ -3632,19 +3632,19 @@ define float @v_contract_mul_add_f32_select_1_64(i32 %arg, float %x, float %y) {
 define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, -1.0, v3, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_n64_n1:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, -1.0, 0xc2800000, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float -64.0, float -1.0
@@ -3656,19 +3656,19 @@ define float @v_contract_mul_add_f32_select_n64_n1(i32 %arg, float %x, float %y)
 define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc2800000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, -1.0, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_n1_n64:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0xc2800000, -1.0, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, float -1.0, float -64.0
@@ -3680,58 +3680,58 @@ define float @v_contract_mul_add_f32_select_n1_n64(i32 %arg, float %x, float %y)
 define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x43000000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x42800000, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x42800000, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f32_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x43000000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 64.0
@@ -3743,19 +3743,19 @@ define float @v_contract_mul_add_f32_select_128_64(i32 %arg, float %x, float %y)
 define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_128_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 4.0, v3, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_128_4:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4.0, 0x43000000, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 128.0, float 4.0
@@ -3767,18 +3767,18 @@ define float @v_contract_mul_add_f32_select_128_4(i32 %arg, float %x, float %y)
 define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_2_4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 4.0, 2.0, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_2_4:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4.0, 2.0, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 2.0, float 4.0
@@ -3790,19 +3790,19 @@ define float @v_contract_mul_add_f32_select_2_4(i32 %arg, float %x, float %y) {
 define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y) {
 ; GFX9-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x43000000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, 4.0, vcc
 ; GFX9-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f32_select_4_128:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0x43000000, 4.0, vcc_lo
 ; GFX1011-NEXT:    v_fma_f32 v0, v1, v0, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, float 4.0, float 128.0
@@ -3814,62 +3814,62 @@ define float @v_contract_mul_add_f32_select_4_128(i32 %arg, float %x, float %y)
 define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x3ff00000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x3ff00000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40500000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 64.0, double 1.0
@@ -3880,62 +3880,62 @@ define double @v_mul_f64_select_64_1(i32 %arg, double %x) {
 define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0x3ff00000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x3ff00000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 1.0, double 64.0
@@ -3946,62 +3946,62 @@ define double @v_mul_f64_select_1_64(i32 %arg, double %x) {
 define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0xbff00000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xbff00000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -1.0, double -64.0
@@ -4012,62 +4012,62 @@ define double @v_mul_f64_select_n1_n64(i32 %arg, double %x) {
 define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x40500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x40600000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40600000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x40600000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0x40600000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0x40500000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x40500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0x40600000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4078,62 +4078,62 @@ define double @v_mul_f64_select_128_64(i32 %arg, double %x) {
 define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0xc0600000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0500000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0500000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -64.0
@@ -4144,62 +4144,62 @@ define double @v_mul_f64_select_n128_n64(i32 %arg, double %x) {
 define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) {
 ; GFX9-SDAG-LABEL: v_mul_f64_select_n128_n16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc0300000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f64_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xc0300000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f64_select_n128_n16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0600000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0300000, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f64_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc0300000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f64_select_n128_n16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v4, 0xc0600000 :: v_dual_mov_b32 v3, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v4, 0xc0300000, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f64_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0xc0300000 :: v_dual_mov_b32 v3, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, 0xc0600000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[1:2], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double -128.0, double -16.0
@@ -4210,62 +4210,62 @@ define double @v_mul_f64_select_n128_n16(i32 %arg, double %x) {
 define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3ff00000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x3ff00000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x3ff00000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40500000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 64.0, double 1.0
@@ -4277,62 +4277,62 @@ define double @v_contract_mul_add_f64_select_64_1(i32 %arg, double %x, double %y
 define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40500000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0x3ff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40500000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x3ff00000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double 1.0, double 64.0
@@ -4344,62 +4344,62 @@ define double @v_contract_mul_add_f64_select_1_64(i32 %arg, double %x, double %y
 define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xbff00000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xbff00000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0xbff00000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0xbff00000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xc0500000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -64.0, double -1.0
@@ -4411,62 +4411,62 @@ define double @v_contract_mul_add_f64_select_n64_n1(i32 %arg, double %x, double
 define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xc0500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0xc0500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0xbff00000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0xc0500000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0xc0500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0xbff00000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0xc0500000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0xc0500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0xbff00000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, double -1.0, double -64.0
@@ -4478,62 +4478,62 @@ define double @v_contract_mul_add_f64_select_n1_n64(i32 %arg, double %x, double
 define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40500000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40500000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40500000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40500000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40500000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40500000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 64.0
@@ -4545,62 +4545,62 @@ define double @v_contract_mul_add_f64_select_128_64(i32 %arg, double %x, double
 define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40100000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40100000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40100000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40600000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 128.0, double 4.0
@@ -4612,31 +4612,31 @@ define double @v_contract_mul_add_f64_select_128_4(i32 %arg, double %x, double %
 define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_2_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40100000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v6, v5, 2.0, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 2.0, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_contract_mul_add_f64_select_2_4:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1011-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1011-NEXT:    v_cndmask_b32_e64 v6, 0x40100000, 2.0, vcc_lo
 ; GFX1011-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 2.0, double 4.0
@@ -4648,62 +4648,62 @@ define double @v_contract_mul_add_f64_select_2_4(i32 %arg, double %x, double %y)
 define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x40600000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v7, 0x40600000
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v6, 0x40100000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40600000, v6, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v6, 0x40600000
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v6, 0x40100000 :: v_dual_mov_b32 v5, 0
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v6, 0x40600000, v6, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f64_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v6, 0x40600000 :: v_dual_mov_b32 v5, 0
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, 0x40100000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[1:2], v[5:6], v[3:4]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, double 4.0, double 128.0
@@ -4715,58 +4715,58 @@ define double @v_contract_mul_add_f64_select_4_128(i32 %arg, double %x, double %
 define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 64.0, half 1.0
@@ -4777,58 +4777,58 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 1.0, half 64.0
@@ -4839,58 +4839,58 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -1.0, half -64.0
@@ -4901,58 +4901,58 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
 define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -4963,58 +4963,58 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -64.0
@@ -5025,58 +5025,58 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
 ; GFX9-SDAG-LABEL: v_mul_f16_select_n128_n16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0xcc00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_f16_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_f16_select_n128_n16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v2, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_f16_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v2, vcc_lo
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -16.0
@@ -5087,58 +5087,58 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
 define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 64.0, half 1.0
@@ -5150,58 +5150,58 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 1.0, half 64.0
@@ -5213,58 +5213,58 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -64.0, half -1.0
@@ -5276,58 +5276,58 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -1.0, half -64.0
@@ -5339,58 +5339,58 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
@@ -5402,58 +5402,58 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 4.0
@@ -5465,58 +5465,58 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 2.0, half 4.0
@@ -5528,58 +5528,58 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 4.0, half 128.0
@@ -5591,7 +5591,6 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5600,11 +5599,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5614,11 +5613,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
@@ -5626,11 +5625,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
@@ -5639,11 +5638,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
@@ -5651,11 +5650,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
@@ -5664,6 +5663,7 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -5674,7 +5674,6 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5683,11 +5682,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5697,11 +5696,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
@@ -5709,11 +5708,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
@@ -5722,11 +5721,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
@@ -5734,11 +5733,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
@@ -5747,6 +5746,7 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -5757,7 +5757,6 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5766,11 +5765,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5780,11 +5779,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
@@ -5792,11 +5791,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
@@ -5805,11 +5804,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
@@ -5817,11 +5816,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
@@ -5830,6 +5829,7 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -5840,7 +5840,6 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5849,11 +5848,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5863,11 +5862,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
@@ -5875,11 +5874,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
@@ -5888,11 +5887,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
@@ -5900,11 +5899,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
@@ -5913,6 +5912,7 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -5923,7 +5923,6 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -5932,11 +5931,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -5946,11 +5945,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
@@ -5958,11 +5957,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
@@ -5971,11 +5970,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
@@ -5983,11 +5982,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
@@ -5996,6 +5995,7 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0>
@@ -6006,7 +6006,6 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6015,11 +6014,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xcc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6029,11 +6028,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
@@ -6041,11 +6040,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
@@ -6054,11 +6053,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
@@ -6066,11 +6065,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
@@ -6079,6 +6078,7 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0>
@@ -6089,7 +6089,6 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6098,11 +6097,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6112,11 +6111,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
@@ -6124,11 +6123,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
@@ -6137,11 +6136,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
@@ -6149,11 +6148,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
@@ -6162,6 +6161,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
@@ -6173,7 +6173,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6182,11 +6181,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6196,11 +6195,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
@@ -6208,11 +6207,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
@@ -6221,11 +6220,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
@@ -6233,11 +6232,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
@@ -6246,6 +6245,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
@@ -6257,7 +6257,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xd400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6266,11 +6265,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6280,11 +6279,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
@@ -6292,11 +6291,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
@@ -6305,11 +6304,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
@@ -6317,11 +6316,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
@@ -6330,6 +6329,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0>
@@ -6341,7 +6341,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6350,11 +6349,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0xd400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6364,11 +6363,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
@@ -6376,11 +6375,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
@@ -6389,11 +6388,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
@@ -6401,11 +6400,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
@@ -6414,6 +6413,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
@@ -6425,7 +6425,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6434,11 +6433,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6448,11 +6447,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
@@ -6460,11 +6459,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
@@ -6473,11 +6472,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
@@ -6485,11 +6484,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
@@ -6498,6 +6497,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
@@ -6509,7 +6509,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6518,11 +6517,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6532,11 +6531,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
@@ -6544,11 +6543,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
@@ -6557,11 +6556,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
@@ -6569,11 +6568,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
@@ -6582,6 +6581,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0>
@@ -6593,7 +6593,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6602,11 +6601,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6616,11 +6615,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
@@ -6628,11 +6627,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
@@ -6641,11 +6640,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
@@ -6653,11 +6652,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
@@ -6666,6 +6665,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0>
@@ -6677,7 +6677,6 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x half> %x, <2 x half> %y) {
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -6686,11 +6685,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -6700,11 +6699,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
@@ -6712,11 +6711,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
 ; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
@@ -6725,11 +6724,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
@@ -6737,11 +6736,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
@@ -6750,6 +6749,7 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0>
@@ -6765,8 +6765,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 define float @v_constrained_fmul_32_f32(float %x, float %y) #0 {
 ; GCN-LABEL: v_constrained_fmul_32_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x42000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret float %val
@@ -6775,40 +6775,40 @@ define float @v_constrained_fmul_32_f32(float %x, float %y) #0 {
 define double @v_constrained_fmul_32_f64(double %x, double %y) #0 {
 ; GFX9-SDAG-LABEL: v_constrained_fmul_32_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40400000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constrained_fmul_32_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40400000
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constrained_fmul_32_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 5
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_constrained_fmul_32_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40400000
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret double %val
@@ -6817,40 +6817,40 @@ define double @v_constrained_fmul_32_f64(double %x, double %y) #0 {
 define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 ; GFX9-SDAG-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x43f00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x43f00000
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 64
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_constrained_fmul_0x1p64_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x43f00000
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double 18446744073709551616.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret double %val
@@ -6859,8 +6859,8 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 define half @v_constrained_fmul_32_f16(half %x, half %y) #0 {
 ; GCN-LABEL: v_constrained_fmul_32_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %val
@@ -6869,41 +6869,41 @@ define half @v_constrained_fmul_32_f16(half %x, half %y) #0 {
 define double @v_mul_fabs_0x1pn1031_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0xfbf9
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 0xfffffbf9
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 0xfffffbf9
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_0x1pn1031_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x800, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 4.34584737989687770135e-311
@@ -6913,40 +6913,40 @@ define double @v_mul_fabs_0x1pn1031_f64(double %x) {
 define double @v_mul_fabs_neg256_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_neg256_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0700000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_neg256_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 8
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_neg256_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 8
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_neg256_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0700000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -256.0
@@ -6956,40 +6956,40 @@ define double @v_mul_fabs_neg256_f64(double %x) {
 define double @v_mul_fabs_neg8_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_neg8_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0200000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_neg8_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_neg8_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, 3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_neg8_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0200000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -8.0
@@ -6999,8 +6999,8 @@ define double @v_mul_fabs_neg8_f64(double %x) {
 define double @v_mul_fabs_neg4_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_neg4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -4.0
@@ -7010,8 +7010,8 @@ define double @v_mul_fabs_neg4_f64(double %x) {
 define double @v_mul_fabs_neg2_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_neg2_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -2.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -2.0
@@ -7021,38 +7021,38 @@ define double @v_mul_fabs_neg2_f64(double %x) {
 define double @v_mul_fabs_neg1_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_neg1_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_neg1_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -1.0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_neg1_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_neg1_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -1.0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_neg1_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_neg1_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -1.0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -1.0
@@ -7062,8 +7062,8 @@ define double @v_mul_fabs_neg1_f64(double %x) {
 define double @v_mul_fabs_neghalf_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_neghalf_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, -0.5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -0.5
@@ -7073,40 +7073,40 @@ define double @v_mul_fabs_neghalf_f64(double %x) {
 define double @v_mul_fabs_negquarter_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, -2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbfd00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, -2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, -2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_negquarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xbfd00000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, -0.25
@@ -7116,40 +7116,40 @@ define double @v_mul_fabs_negquarter_f64(double %x) {
 define double @v_mul_fabs_quarter_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_quarter_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, -2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fd00000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_quarter_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, -2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_quarter_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, -2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_quarter_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x3fd00000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 0.25
@@ -7159,8 +7159,8 @@ define double @v_mul_fabs_quarter_f64(double %x) {
 define double @v_mul_fabs_half_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_half_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0.5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 0.5
@@ -7170,8 +7170,8 @@ define double @v_mul_fabs_half_f64(double %x) {
 define double @v_mul_fabs_1_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_1_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 1.0
@@ -7181,38 +7181,38 @@ define double @v_mul_fabs_1_f64(double %x) {
 define double @v_mul_fabs_2_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_2_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_2_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 2.0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_2_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_2_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 2.0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_2_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_2_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 2.0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 2.0
@@ -7222,8 +7222,8 @@ define double @v_mul_fabs_2_f64(double %x) {
 define double @v_mul_fabs_4_f64(double %x) {
 ; GCN-LABEL: v_mul_fabs_4_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 4.0
@@ -7233,40 +7233,40 @@ define double @v_mul_fabs_4_f64(double %x) {
 define double @v_mul_fabs_8_f64(double %x) {
 ; GFX9-SDAG-LABEL: v_mul_fabs_8_f64:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x40200000
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_mul_fabs_8_f64:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, |v[0:1]|
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_8_f64:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, 3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_fabs_8_f64:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40200000, |v[0:1]|
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %mul = fmul double %fabs.x, 8.0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 592be64e1e31429..94b0a0b74c04ab2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -15,50 +15,50 @@
 define half @v_fneg_add_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd half %a, %b
   %fneg = fneg half %add
@@ -68,28 +68,28 @@ define half @v_fneg_add_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_add_store_use_add_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, v0, v1
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_add_store_use_add_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_f16_e32 v1, v0, v1
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_add_store_use_add_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v1, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd half %a, %b
   %fneg = fneg half %add
@@ -101,7 +101,6 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -109,49 +108,50 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
 ; SI-SAFE-NEXT:    v_add_f32_e32 v1, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; SI-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f16_e32 v1, v0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
 ; VI-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v0, v1
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd half %a, %b
   %fneg = fneg half %add
@@ -165,50 +165,50 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
 define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_fneg_x_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_fneg_x_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %add = fadd half %fneg.a, %b
@@ -219,50 +219,50 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
 define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v1, v0
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v1, v0
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_x_fneg_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_x_fneg_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, v1, v0
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg half %b
   %add = fadd half %a, %fneg.b
@@ -273,50 +273,50 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
 define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_sub_f16_e64 v0, -v0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_add_f16_e32 v0, v0, v1
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e64 v0, -v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fneg.b = fneg half %b
@@ -328,7 +328,6 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -336,53 +335,54 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v3, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v3, v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; VI-SAFE-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v1, v2
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; GFX11-NSZ-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %add = fadd half %fneg.a, %b
@@ -395,7 +395,6 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -406,11 +405,11 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
 ; SI-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v3
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; SI-SAFE-NEXT:    v_mul_f32_e32 v1, v4, v2
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -420,42 +419,43 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v4, -v0
 ; SI-NSZ-NEXT:    v_sub_f32_e32 v0, v3, v1
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, v4, v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v0, v3
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v3, v0, v1
 ; VI-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v1, v1, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v3, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v3, v0, v1
 ; GFX11-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %add = fadd half %fneg.a, %b
@@ -687,24 +687,24 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 define half @v_fneg_mul_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %a, %b
   %fneg = fneg half %mul
@@ -714,28 +714,28 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_store_use_mul_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v1, v0, v1
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_store_use_mul_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v1, v0, v1
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_store_use_mul_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %a, %b
   %fneg = fneg half %mul
@@ -747,28 +747,28 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_multi_use_mul_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_multi_use_mul_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; VI-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_multi_use_mul_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %a, %b
   %fneg = fneg half %mul
@@ -781,24 +781,24 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
 define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_fneg_x_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_fneg_x_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_fneg_x_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %mul = fmul half %fneg.a, %b
@@ -809,24 +809,24 @@ define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
 define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_x_fneg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_x_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_x_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg half %b
   %mul = fmul half %a, %fneg.b
@@ -837,24 +837,24 @@ define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
 define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_fneg_fneg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fneg.b = fneg half %b
@@ -866,30 +866,30 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v3, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_store_use_fneg_x_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %mul = fmul half %fneg.a, %b
@@ -902,7 +902,6 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
 define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 {
 ; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -912,23 +911,24 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c
 ; SI-NEXT:    v_cvt_f32_f16_e64 v4, -v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v3, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, v4, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v3, v0, v1
 ; VI-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %mul = fmul half %fneg.a, %b
@@ -946,7 +946,6 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c
 define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -954,23 +953,24 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %fneg = fneg half %min
@@ -980,24 +980,24 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
 define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %fneg = fneg half %min
@@ -1007,20 +1007,20 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 {
 define half @v_fneg_self_minnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_self_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_self_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_self_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %a)
   %min.fneg = fneg half %min
@@ -1030,20 +1030,20 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 {
 define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_self_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_self_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_self_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %a)
   %min.fneg = fneg half %min
@@ -1053,26 +1053,26 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_posk_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_posk_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, -4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_posk_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, -4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 4.0, half %a)
   %fneg = fneg half %min
@@ -1082,22 +1082,22 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
 define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -4.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_posk_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -4.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 4.0, half %a)
   %fneg = fneg half %min
@@ -1107,26 +1107,26 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_negk_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_negk_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_negk_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half -4.0, half %a)
   %fneg = fneg half %min
@@ -1136,22 +1136,22 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
 define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, 4.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_negk_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, 4.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half -4.0, half %a)
   %fneg = fneg half %min
@@ -1161,26 +1161,26 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_0_minnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_0_minnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_minnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_minnum_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call nnan half @llvm.minnum.f16(half 0.0, half %a)
   %fneg = fneg half %min
@@ -1190,26 +1190,26 @@ define half @v_fneg_0_minnum_f16(half %a) #0 {
 define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_neg0_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg0_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, 0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_neg0_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half -0.0, half %a)
   %fneg = fneg half %min
@@ -1219,28 +1219,28 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
 define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_inv2pi_minnum_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
   %fneg = fneg half %min
@@ -1250,28 +1250,28 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_neg_inv2pi_minnum_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
   %fneg = fneg half %min
@@ -1281,22 +1281,22 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, 0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half -0.0, half %a)
   %fneg = fneg half %min
@@ -1306,7 +1306,6 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1314,23 +1313,24 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0.0, half %a)
   %fneg = fneg half %min
@@ -1341,7 +1341,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1349,23 +1348,24 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
   %fneg = fneg half %min
@@ -1376,28 +1376,28 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
 define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0.0, half %a)
   %fneg = fneg half %min
@@ -1408,7 +1408,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
 define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
@@ -1417,25 +1416,26 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %fneg = fneg half %min
@@ -1448,31 +1448,31 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
 define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xc400
 ; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e32 v1, 4.0, v0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %fneg = fneg half %min
@@ -1489,7 +1489,6 @@ define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b)
 define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -1497,23 +1496,24 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %fneg = fneg half %max
@@ -1523,24 +1523,24 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
 define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e64 v0, -v0, -v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e64 v0, -v0, -v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %fneg = fneg half %max
@@ -1550,20 +1550,20 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 {
 define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_self_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_self_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_self_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %a)
   %max.fneg = fneg half %max
@@ -1573,20 +1573,20 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 {
 define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_self_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %a)
   %max.fneg = fneg half %max
@@ -1596,26 +1596,26 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_posk_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_posk_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_min_f16_e32 v0, -4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_posk_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, -4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half 4.0, half %a)
   %fneg = fneg half %max
@@ -1625,22 +1625,22 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
 define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e64 v0, -v0, -4.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e64 v0, -v0, -4.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half 4.0, half %a)
   %fneg = fneg half %max
@@ -1650,26 +1650,26 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_negk_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_negk_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_negk_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half -4.0, half %a)
   %fneg = fneg half %max
@@ -1679,22 +1679,22 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
 define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e64 v0, -v0, 4.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e64 v0, -v0, 4.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half -4.0, half %a)
   %fneg = fneg half %max
@@ -1704,26 +1704,26 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_0_maxnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_0_maxnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_maxnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_maxnum_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call nnan half @llvm.maxnum.f16(half 0.0, half %a)
   %fneg = fneg half %max
@@ -1733,26 +1733,26 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 {
 define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
 ; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_neg0_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, 0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half -0.0, half %a)
   %fneg = fneg half %max
@@ -1762,22 +1762,22 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
 define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 {
 ; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e64 v0, -v0, 0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_f16_e64 v0, -v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half -0.0, half %a)
   %fneg = fneg half %max
@@ -1787,7 +1787,6 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 {
 define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1795,23 +1794,24 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_max_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half 0.0, half %a)
   %fneg = fneg half %max
@@ -1822,28 +1822,28 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, 0, v0
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half 0.0, half %a)
   %fneg = fneg half %max
@@ -1854,7 +1854,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
 define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) #0 {
 ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
@@ -1863,25 +1862,26 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_min_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %fneg = fneg half %max
@@ -1894,31 +1894,31 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
 define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) #4 {
 ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_min_f16_e64 v0, -v0, -v1
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xc400
 ; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e32 v1, 4.0, v0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %fneg = fneg half %max
@@ -1935,7 +1935,6 @@ define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b)
 define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1944,11 +1943,11 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1956,33 +1955,34 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
   %fneg = fneg half %fma
@@ -1992,7 +1992,6 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
 define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #0 {
 ; SI-LABEL: v_fneg_fma_store_use_fma_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2001,21 +2000,22 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_fma_f32 v1, v0, v1, v2
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fma_store_use_fma_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_fma_f16 v1, v0, v1, v2
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fma_store_use_fma_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v1, v0, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
   %fneg = fneg half %fma
@@ -2027,7 +2027,6 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #
 define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2037,11 +2036,11 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #
 ; SI-SAFE-NEXT:    v_fma_f32 v1, v0, v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; SI-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2050,38 +2049,39 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
 ; VI-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
 ; GFX11-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
   %fneg = fneg half %fma
@@ -2094,7 +2094,6 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #
 define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2103,11 +2102,11 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, -v0, v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2115,33 +2114,34 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, -v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v0, -v0, v1, v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
@@ -2152,7 +2152,6 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
 define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2161,11 +2160,11 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v0, -v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2173,33 +2172,34 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, v0, -v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v0, v0, -v1, v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg half %b
   %fma = call half @llvm.fma.f16(half %a, half %fneg.b, half %c)
@@ -2210,7 +2210,6 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
 define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2219,11 +2218,11 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2231,33 +2230,34 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fneg.b = fneg half %b
@@ -2269,7 +2269,6 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
 define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2278,11 +2277,11 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, -v0, v1, -v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2290,33 +2289,34 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, v1, v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, -v0, v1, -v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, v1, v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v0, -v0, v1, -v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fneg.c = fneg half %c
@@ -2328,7 +2328,6 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
 define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2337,11 +2336,11 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v0, v1, -v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2349,33 +2348,34 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, v0, v1, -v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v0, v0, v1, -v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.c = fneg half %c
   %fma = call half @llvm.fma.f16(half %a, half %b, half %fneg.c)
@@ -2386,7 +2386,6 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
 define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e64 v3, -v0
@@ -2396,11 +2395,11 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v3, v4, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2409,42 +2408,43 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v4, v3, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x8000, v0
 ; VI-SAFE-NEXT:    v_fma_f16 v0, -v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v1, v3
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_xor_b32_e32 v3, 0x8000, v0
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, v1, -v2
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v1, v3
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v1, -v0, v1, v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v2, v0, v1, -v2
 ; GFX11-NSZ-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
@@ -2457,7 +2457,6 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half
 define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half %c, half %d) #0 {
 ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e64 v0, -v0
@@ -2469,11 +2468,11 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half
 ; SI-SAFE-NEXT:    v_fma_f32 v0, v4, v1, v2
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; SI-SAFE-NEXT:    v_mul_f32_e32 v1, v4, v3
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -2485,42 +2484,43 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v5, -v0
 ; SI-NSZ-NEXT:    v_fma_f32 v0, v4, v1, -v2
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, v5, v3
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, -v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v3
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v0, v2
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v2, v0, v1, -v2
 ; VI-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v3
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v1, -v0, v1, v2
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_mul_f16_e64 v1, -v0, v3
 ; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v2, v0, v1, -v2
 ; GFX11-NSZ-NEXT:    v_mul_f16_e64 v1, -v0, v3
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
@@ -2538,7 +2538,6 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half
 define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fmad_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2547,11 +2546,11 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v2
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fmad_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e64 v1, -v1
@@ -2559,33 +2558,34 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NSZ-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fmad_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fmad_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fmad_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fmad_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
   %fneg = fneg half %fma
@@ -2595,7 +2595,6 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
 define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fmad_v4f32:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v9, v9
@@ -2628,11 +2627,11 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v9
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v10
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v11
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fmad_v4f32:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v10, v10
@@ -2661,11 +2660,11 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; SI-NSZ-NEXT:    v_mad_f32 v1, v1, v5, -v9
 ; SI-NSZ-NEXT:    v_mad_f32 v2, v2, v6, -v10
 ; SI-NSZ-NEXT:    v_mad_f32 v3, v3, v7, -v11
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fmad_v4f32:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -2682,11 +2681,11 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; VI-SAFE-NEXT:    v_or_b32_e32 v1, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fmad_v4f32:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -2701,23 +2700,24 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; VI-NSZ-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; VI-NSZ-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fmad_v4f32:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fmad_v4f32:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
   %fneg = fneg <4 x half> %fma
@@ -2727,7 +2727,6 @@ define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c
 define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -2737,11 +2736,11 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c)
 ; SI-SAFE-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v2
 ; SI-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v2
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -2750,38 +2749,39 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c)
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NSZ-NEXT:    v_mad_f32 v0, v0, v1, -v2
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, v0, v1, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v1
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
 ; VI-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
 ; GFX11-SAFE-NEXT:    v_mul_f16_e32 v1, 4.0, v2
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v0, v0, -v1, -v2
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
   %fneg = fneg half %fma
@@ -2798,25 +2798,25 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c)
 define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 {
 ; SI-LABEL: v_fneg_fp_extend_f16_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_f32_e64 v[0:1], -v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_extend_f16_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_extend_f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to double
   %fneg = fneg double %fpext
@@ -2826,23 +2826,23 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 {
 define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 {
 ; SI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fpext = fpext half %fneg.a to double
@@ -2853,29 +2853,29 @@ define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 {
 define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 {
 ; SI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; SI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fpext = fpext half %fneg.a to double
@@ -2888,30 +2888,30 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0
 define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #0 {
 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
 ; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to double
   %fneg = fneg double %fpext
@@ -2923,30 +2923,30 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #
 define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(half %a) #0 {
 ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; SI-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; SI-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; VI-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to double
   %fneg = fneg double %fpext
@@ -2959,24 +2959,24 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h
 define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 {
 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to float
   %fneg = fneg float %fpext
@@ -2992,7 +2992,6 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0
 define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_f64_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -3037,11 +3036,11 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_f64_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; VI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
@@ -3085,11 +3084,11 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_round_f64_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
@@ -3134,6 +3133,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
   %fneg = fneg half %fpround
@@ -3143,7 +3143,6 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -3186,11 +3185,11 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -3232,11 +3231,11 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
@@ -3280,6 +3279,7 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to half
@@ -3290,7 +3290,6 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v3, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 0x1ff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -3337,11 +3336,11 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v3
@@ -3387,11 +3386,11 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v2, 0x1ff, v1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_bfe_u32 v4, v1, 20, 11
@@ -3437,6 +3436,7 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to half
@@ -3449,7 +3449,6 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, double %c) #0 {
 ; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, 0x1ff, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -3495,11 +3494,11 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_mul_f64 v[1:2], -v[0:1], v[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v4, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v4, v4, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
@@ -3545,11 +3544,11 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
 ; VI-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v4, 0x1ff, v1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-NEXT:    v_bfe_u32 v6, v1, 20, 11
@@ -3594,6 +3593,7 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to half
@@ -3607,7 +3607,6 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -3652,11 +3651,11 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -3699,11 +3698,11 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
@@ -3748,6 +3747,7 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v1, 0x8000, v1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
   %fneg = fneg half %fpround
@@ -3763,22 +3763,22 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 define half @v_fneg_trunc_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_trunc_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_trunc_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_trunc_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_trunc_f16_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_trunc_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_trunc_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc = call half @llvm.trunc.f16(half %a)
   %fneg = fneg half %trunc
@@ -3792,7 +3792,6 @@ define half @v_fneg_trunc_f16(half %a) #0 {
 define half @v_fneg_round_f16(half %a) #0 {
 ; SI-SAFE-LABEL: v_fneg_round_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SAFE-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SAFE-NEXT:    v_trunc_f32_e32 v1, v0
@@ -3803,11 +3802,11 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; SI-SAFE-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; SI-SAFE-NEXT:    v_add_f32_e32 v0, v1, v0
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_round_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NSZ-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NSZ-NEXT:    v_trunc_f32_e32 v1, v0
@@ -3817,11 +3816,11 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; SI-NSZ-NEXT:    s_brev_b32 s4, -2
 ; SI-NSZ-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; SI-NSZ-NEXT:    v_sub_f32_e64 v0, -v1, v0
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_round_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_trunc_f16_e32 v1, v0
 ; VI-SAFE-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x3c00
@@ -3831,11 +3830,11 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; VI-SAFE-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; VI-SAFE-NEXT:    v_add_f16_e32 v0, v1, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_round_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_trunc_f16_e32 v1, v0
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x3c00
@@ -3844,11 +3843,11 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; VI-NSZ-NEXT:    s_movk_i32 s4, 0x7fff
 ; VI-NSZ-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; VI-NSZ-NEXT:    v_sub_f16_e64 v0, -v1, v0
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: v_fneg_round_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_trunc_f16_e32 v1, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v2, v0, v1
@@ -3859,11 +3858,11 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, v1, v0
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: v_fneg_round_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_trunc_f16_e32 v1, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
@@ -3873,6 +3872,7 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; GFX11-NSZ-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v1, v0
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %round = call half @llvm.round.f16(half %a)
   %fneg = fneg half %round
@@ -3886,22 +3886,22 @@ define half @v_fneg_round_f16(half %a) #0 {
 define half @v_fneg_rint_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_rint_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_rndne_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_rint_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rndne_f16_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_rint_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %rint = call half @llvm.rint.f16(half %a)
   %fneg = fneg half %rint
@@ -3915,22 +3915,22 @@ define half @v_fneg_rint_f16(half %a) #0 {
 define half @v_fneg_nearbyint_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_nearbyint_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_rndne_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_nearbyint_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rndne_f16_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_nearbyint_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %nearbyint = call half @llvm.nearbyint.f16(half %a)
   %fneg = fneg half %nearbyint
@@ -3944,28 +3944,28 @@ define half @v_fneg_nearbyint_f16(half %a) #0 {
 define half @v_fneg_sin_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_sin_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
 ; SI-NEXT:    v_fract_f32_e32 v0, v0
 ; SI-NEXT:    v_sin_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_sin_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, 0xb118, v0
 ; VI-NEXT:    v_fract_f16_e32 v0, v0
 ; VI-NEXT:    v_sin_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_sin_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, 0xb118, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sin_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sin = call half @llvm.sin.f16(half %a)
   %fneg = fneg half %sin
@@ -3979,20 +3979,20 @@ define half @v_fneg_sin_f16(half %a) #0 {
 define half @v_fneg_canonicalize_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_canonicalize_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_canonicalize_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_canonicalize_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc = call half @llvm.canonicalize.f16(half %a)
   %fneg = fneg half %trunc
@@ -4006,7 +4006,6 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
 define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 {
 ; SI-LABEL: v_fneg_copytoreg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
@@ -4017,6 +4016,7 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_cbranch_execz .LBB81_2
 ; SI-NEXT:  ; %bb.1: ; %if
@@ -4032,18 +4032,18 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    flat_store_short v[0:1], v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_copytoreg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 1, v6
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_cbranch_execz .LBB81_2
 ; VI-NEXT:  ; %bb.1: ; %if
@@ -4053,12 +4053,11 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
 ; VI-NEXT:  .LBB81_2: ; %endif
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    flat_store_short v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_copytoreg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -4066,6 +4065,7 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 1, v6
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v5
 ; GFX11-NEXT:    s_cbranch_execz .LBB81_2
 ; GFX11-NEXT:  ; %bb.1: ; %if
@@ -4076,6 +4076,7 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4103,7 +4104,6 @@ endif:
 define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
 ; SI-LABEL: v_fneg_inlineasm_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -4113,24 +4113,25 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    ; use v1
 ; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inlineasm_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; use v0
 ; VI-NEXT:    ;;#ASMEND
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_inlineasm_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v0
 ; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %a, %b
   %fneg = fneg half %mul
@@ -4146,7 +4147,6 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
 define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 {
 ; SI-LABEL: v_fneg_inlineasm_multi_use_src_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -4156,27 +4156,28 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a,
 ; SI-NEXT:    ;;#ASMSTART
 ; SI-NEXT:    ; use v1
 ; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inlineasm_multi_use_src_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, v2, v3
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; VI-NEXT:    ;;#ASMSTART
 ; VI-NEXT:    ; use v1
 ; VI-NEXT:    ;;#ASMEND
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_inlineasm_multi_use_src_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v1
 ; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4196,7 +4197,6 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a,
 define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) #0 {
 ; SI-LABEL: multiuse_fneg_2_vop3_users_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -4205,23 +4205,24 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; SI-NEXT:    v_fma_f32 v0, -v3, v1, v2
 ; SI-NEXT:    v_fma_f32 v1, -v3, v2, 2.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: multiuse_fneg_2_vop3_users_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_fma_f16 v3, -v0, v1, v2
 ; VI-NEXT:    v_fma_f16 v1, -v0, v2, 2.0
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: multiuse_fneg_2_vop3_users_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v3, -v0, v1, v2
 ; GFX11-NEXT:    v_fma_f16 v1, -v0, v2, 2.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
@@ -4236,7 +4237,6 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c)
 define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) #0 {
 ; SI-LABEL: multiuse_fneg_2_vop2_users_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -4245,23 +4245,24 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_mul_f32_e32 v0, v3, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, v3, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: multiuse_fneg_2_vop2_users_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v3, -v0, v1
 ; VI-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: multiuse_fneg_2_vop2_users_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v3, -v0, v1
 ; GFX11-NEXT:    v_mul_f16_e64 v1, -v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %mul0 = fmul half %fneg.a, %b
@@ -4275,7 +4276,6 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c)
 define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, half %a, half %b, half %c) #0 {
 ; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
 ; SI-NEXT:    v_cvt_f16_f32_e64 v1, -v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
@@ -4284,20 +4284,21 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out,
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_fma_f32 v0, v1, v0, 2.0
 ; SI-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_fma_f16 v0, -v2, v3, 2.0
 ; VI-NEXT:    v_mul_f16_e64 v1, -v2, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: multiuse_fneg_vop2_vop3_users_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, -v2, v3, 2.0
 ; GFX11-NEXT:    v_mul_f16_e64 v1, -v2, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4317,7 +4318,6 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out,
 define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
 ; SI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v0, v3
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v4
@@ -4329,11 +4329,11 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou
 ; SI-SAFE-NEXT:    v_fma_f32 v1, v1, v0, 2.0
 ; SI-SAFE-NEXT:    v_mul_f32_e64 v0, -v1, v2
 ; SI-SAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v0, v3
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v4
@@ -4345,40 +4345,41 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou
 ; SI-NSZ-NEXT:    v_fma_f32 v1, v1, -v0, -2.0
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v0, v1, v2
 ; SI-NSZ-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, v2, v3, 2.0
 ; VI-SAFE-NEXT:    v_mul_f16_e64 v0, -v1, v4
 ; VI-SAFE-NEXT:    v_mul_f16_e64 v1, -v1, v5
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v1, v2, -v3, -2.0
 ; VI-NSZ-NEXT:    v_mul_f16_e32 v0, v1, v4
 ; VI-NSZ-NEXT:    v_mul_f16_e32 v1, v1, v5
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fma_f16 v1, v2, v3, 2.0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_mul_f16_e64 v0, -v1, v4
 ; GFX11-SAFE-NEXT:    v_mul_f16_e64 v1, -v1, v5
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v1, v2, -v3, -2.0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, v1, v4
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v1, v1, v5
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4399,7 +4400,6 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou
 define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
 ; SI-LABEL: one_use_cost_to_fold_into_src_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
@@ -4408,21 +4408,22 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_trunc_f32_e32 v1, v1
 ; SI-NEXT:    v_fma_f32 v0, -v1, v2, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: one_use_cost_to_fold_into_src_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_trunc_f16_e32 v0, v2
 ; VI-NEXT:    v_fma_f16 v0, -v0, v3, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: one_use_cost_to_fold_into_src_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_trunc_f16_e32 v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f16 v0, -v0, v3, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4437,7 +4438,6 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h
 define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
 ; SI-LABEL: multi_use_cost_to_fold_into_src:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
@@ -4449,23 +4449,24 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha
 ; SI-NEXT:    v_trunc_f32_e32 v1, v1
 ; SI-NEXT:    v_fma_f32 v0, -v1, v2, v0
 ; SI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: multi_use_cost_to_fold_into_src:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_trunc_f16_e32 v1, v2
 ; VI-NEXT:    v_fma_f16 v0, -v1, v3, v4
 ; VI-NEXT:    v_mul_f16_e32 v1, v1, v5
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: multi_use_cost_to_fold_into_src:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_trunc_f16_e32 v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f16 v0, -v1, v3, v4
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -4486,7 +4487,6 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha
 define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %arg1, <2 x half> %arg2) #0 {
 ; SI-LABEL: fneg_fma_fneg_dagcombine_loop:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -4512,11 +4512,11 @@ define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %ar
 ; SI-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; SI-NEXT:    v_mul_f32_e32 v1, v1, v5
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: fneg_fma_fneg_dagcombine_loop:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x8000
 ; VI-NEXT:    v_fma_f16 v3, v1, -v2, s4
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
@@ -4527,15 +4527,16 @@ define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %ar
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fma_fneg_dagcombine_loop:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v2, 0x8000 op_sel_hi:[1,1,0] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v1, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i3 = call fast <2 x half> @llvm.fma.v2f16(<2 x half> %arg1, <2 x half> %arg2, <2 x half> zeroinitializer)
@@ -4550,24 +4551,24 @@ bb:
 define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
 ; SI-LABEL: nnan_fmul_neg1_to_fneg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: nnan_fmul_neg1_to_fneg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: nnan_fmul_neg1_to_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -1.0
   %add = fmul nnan half %mul, %y
@@ -4579,24 +4580,24 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
 define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
 ; SI-LABEL: denormal_fmul_neg1_to_fneg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: denormal_fmul_neg1_to_fneg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: denormal_fmul_neg1_to_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul nnan half %x, -1.0
   %add = fmul half %mul, %y
@@ -4607,7 +4608,6 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
 define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
 ; SI-LABEL: denorm_snan_fmul_neg1_to_fneg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -4615,21 +4615,22 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: denorm_snan_fmul_neg1_to_fneg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, v0, -v0
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: denorm_snan_fmul_neg1_to_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %canonical = fmul half %x, %x
   %mul = fmul half %canonical, -1.0
@@ -4640,27 +4641,27 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
 define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
 ; SI-LABEL: flush_snan_fmul_neg1_to_fneg:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: flush_snan_fmul_neg1_to_fneg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flush_snan_fmul_neg1_to_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %quiet = call half @llvm.canonicalize.f16(half %x)
   %mul = fmul half %quiet, -1.0
@@ -4671,7 +4672,6 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
 define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 ; SI-LABEL: fadd_select_fneg_fneg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -4681,23 +4681,24 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: fadd_select_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fadd_select_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg half %x
@@ -4711,7 +4712,6 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; SI-LABEL: fadd_select_fneg_fneg_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -4731,25 +4731,26 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; SI-NEXT:    v_sub_f32_e32 v1, v3, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: fadd_select_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fadd_select_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg <2 x half> %x
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
index 1c9ad76190e88e5..3c1861f2ce6ce24 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.legal.f16.ll
@@ -9,8 +9,8 @@
 define half @v_fneg_rcp_f16(half %a) #0 {
 ; GCN-LABEL: v_fneg_rcp_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f16_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %rcp = call half @llvm.amdgcn.rcp.f16(half %a)
   %fneg = fneg half %rcp
@@ -20,8 +20,8 @@ define half @v_fneg_rcp_f16(half %a) #0 {
 define half @v_fneg_rcp_fneg_f16(half %a) #0 {
 ; GCN-LABEL: v_fneg_rcp_fneg_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %rcp = call half @llvm.amdgcn.rcp.f16(half %fneg.a)
@@ -32,10 +32,10 @@ define half @v_fneg_rcp_fneg_f16(half %a) #0 {
 define { half, half } @v_fneg_rcp_store_use_fneg_f16(half %a) #0 {
 ; GCN-LABEL: v_fneg_rcp_store_use_fneg_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f16_e32 v2, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %rcp = call half @llvm.amdgcn.rcp.f16(half %fneg.a)
@@ -48,10 +48,10 @@ define { half, half } @v_fneg_rcp_store_use_fneg_f16(half %a) #0 {
 define { half, half } @v_fneg_rcp_multi_use_fneg_f16(half %a, half %c) #0 {
 ; GCN-LABEL: v_fneg_rcp_multi_use_fneg_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f16_e32 v2, v0
 ; GCN-NEXT:    v_mul_f16_e64 v1, -v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg half %a
   %rcp = call half @llvm.amdgcn.rcp.f16(half %fneg.a)
@@ -69,8 +69,8 @@ define { half, half } @v_fneg_rcp_multi_use_fneg_f16(half %a, half %c) #0 {
 define half @v_fneg_amdgcn_sin_f16(half %a) #0 {
 ; GCN-LABEL: v_fneg_amdgcn_sin_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sin_f16_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sin = call half @llvm.amdgcn.sin.f16(half %a)
   %fneg = fneg half %sin
@@ -84,12 +84,12 @@ define half @v_fneg_amdgcn_sin_f16(half %a) #0 {
 define { float, float } @v_fneg_interp_p1_f16(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_interp_p1_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v1, v0, -v1
 ; GCN-NEXT:    s_mov_b32 m0, 0
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
 ; GCN-NEXT:    v_interp_p1ll_f16 v0, v1, attr0.x
 ; GCN-NEXT:    v_interp_p1ll_f16 v1, v1, attr0.y
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -103,13 +103,13 @@ define { float, float } @v_fneg_interp_p1_f16(float %a, float %b) #0 {
 define { half, half } @v_fneg_interp_p2_f16(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_interp_p2_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v1, v0, -v1
 ; GCN-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GCN-NEXT:    s_mov_b32 m0, 0
 ; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
 ; GCN-NEXT:    v_interp_p2_f16 v0, v1, attr0.x, v2
 ; GCN-NEXT:    v_interp_p2_f16 v1, v1, attr0.y, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -129,8 +129,8 @@ define half @v_fneg_arithmetic_fence_f16(half %a) #0 {
 ; GCN-LABEL: v_fneg_arithmetic_fence_f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    ;ARITH_FENCE
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fence = call half @llvm.arithmetic.fence.f16(half %a)
   %fneg = fneg half %fence
@@ -140,10 +140,10 @@ define half @v_fneg_arithmetic_fence_f16(half %a) #0 {
 define half @v_fneg_arithmetic_fence_fmul_f16(half %a, half %b) #0 {
 ; GCN-LABEL: v_fneg_arithmetic_fence_fmul_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GCN-NEXT:    ;ARITH_FENCE
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %a, %b
   %fence = call half @llvm.arithmetic.fence.f16(half %mul)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 203c6223e762f20..5c6492cb73e16cc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -12,15 +12,15 @@
 define float @v_fneg_add_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_sub_f32_e64 v0, -v0, v1
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %a, %b
   %fneg = fneg float %add
@@ -30,9 +30,9 @@ define float @v_fneg_add_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_add_store_use_add_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_add_store_use_add_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %a, %b
   %fneg = fneg float %add
@@ -44,17 +44,17 @@ define { float, float } @v_fneg_add_store_use_add_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_add_multi_use_add_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_multi_use_add_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; GCN-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_multi_use_add_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_sub_f32_e64 v0, -v0, v1
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %a, %b
   %fneg = fneg float %add
@@ -68,15 +68,15 @@ define { float, float } @v_fneg_add_multi_use_add_f32(float %a, float %b) #0 {
 define float @v_fneg_add_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_fneg_x_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_fneg_x_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %add = fadd float %fneg.a, %b
@@ -87,15 +87,15 @@ define float @v_fneg_add_fneg_x_f32(float %a, float %b) #0 {
 define float @v_fneg_add_x_fneg_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_x_fneg_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_x_fneg_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_sub_f32_e32 v0, v1, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg float %b
   %add = fadd float %a, %fneg.b
@@ -106,15 +106,15 @@ define float @v_fneg_add_x_fneg_f32(float %a, float %b) #0 {
 define float @v_fneg_add_fneg_fneg_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_fneg_fneg_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_sub_f32_e64 v0, -v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_fneg_fneg_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fneg.b = fneg float %b
@@ -126,19 +126,19 @@ define float @v_fneg_add_fneg_fneg_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_add_store_use_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-SAFE-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GCN-NSZ-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %add = fadd float %fneg.a, %b
@@ -151,19 +151,19 @@ define { float, float } @v_fneg_add_store_use_fneg_x_f32(float %a, float %b) #0
 define { float, float } @v_fneg_add_multi_use_fneg_x_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v1, v1, v0
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
 ; GCN-SAFE-NEXT:    v_mul_f32_e64 v1, -v0, v2
 ; GCN-SAFE-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_sub_f32_e32 v3, v0, v1
 ; GCN-NSZ-NEXT:    v_mul_f32_e64 v1, -v0, v2
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %add = fadd float %fneg.a, %b
@@ -338,15 +338,15 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6
 define double @v_fneg_add_f64(double %a, double %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_f64:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd double %a, %b
   %fneg = fneg double %add
@@ -356,10 +356,10 @@ define double @v_fneg_add_f64(double %a, double %b) #0 {
 define { double, double } @v_fneg_add_store_use_add_f64(double %a, double %b) #0 {
 ; GCN-LABEL: v_fneg_add_store_use_add_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd double %a, %b
   %fneg = fneg double %add
@@ -371,26 +371,26 @@ define { double, double } @v_fneg_add_store_use_add_f64(double %a, double %b) #0
 define { double, double } @v_fneg_add_multi_use_add_f64(double %a, double %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f64:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; SI-SAFE-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v1, v4
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_multi_use_add_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
 ; GCN-NSZ-NEXT:    v_mul_f64 v[2:3], v[0:1], -4.0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f64:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
 ; VI-SAFE-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd double %a, %b
   %fneg = fneg double %add
@@ -404,15 +404,15 @@ define { double, double } @v_fneg_add_multi_use_add_f64(double %a, double %b) #0
 define double @v_fneg_add_fneg_x_f64(double %a, double %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_fneg_x_f64:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f64 v[0:1], v[2:3], -v[0:1]
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_fneg_x_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %add = fadd double %fneg.a, %b
@@ -423,15 +423,15 @@ define double @v_fneg_add_fneg_x_f64(double %a, double %b) #0 {
 define double @v_fneg_add_x_fneg_f64(double %a, double %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_x_fneg_f64:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_x_fneg_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[0:1], v[2:3], -v[0:1]
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg double %b
   %add = fadd double %a, %fneg.b
@@ -442,15 +442,15 @@ define double @v_fneg_add_x_fneg_f64(double %a, double %b) #0 {
 define double @v_fneg_add_fneg_fneg_f64(double %a, double %b) #0 {
 ; GCN-SAFE-LABEL: v_fneg_add_fneg_fneg_f64:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_add_f64 v[0:1], -v[0:1], -v[2:3]
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_fneg_fneg_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fneg.b = fneg double %b
@@ -462,43 +462,43 @@ define double @v_fneg_add_fneg_fneg_f64(double %a, double %b) #0 {
 define { double, double } @v_fneg_add_store_use_fneg_x_f64(double %a, double %b) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f64:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v5, v1
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-SAFE-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v5
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v2, v4
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f64:
 ; SI-NSZ:       ; %bb.0:
-; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    v_mov_b32_e32 v5, v1
 ; SI-NSZ-NEXT:    v_mov_b32_e32 v4, v0
 ; SI-NSZ-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
 ; SI-NSZ-NEXT:    v_xor_b32_e32 v3, 0x80000000, v5
 ; SI-NSZ-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f64:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f64 v[4:5], v[2:3], -v[0:1]
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, v0
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v5
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v0, v4
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f64:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
 ; VI-NSZ-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v2, v0
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v0, v4
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v1, v5
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %add = fadd double %fneg.a, %b
@@ -511,30 +511,30 @@ define { double, double } @v_fneg_add_store_use_fneg_x_f64(double %a, double %b)
 define { double, double } @v_fneg_add_multi_use_fneg_x_f64(double %a, double %b, double %c) #0 {
 ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f64:
 ; SI-SAFE:       ; %bb.0:
-; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    v_add_f64 v[6:7], v[2:3], -v[0:1]
 ; SI-SAFE-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[4:5]
 ; SI-SAFE-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v0, v6
 ; SI-SAFE-NEXT:    v_mov_b32_e32 v1, v7
+; SI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f64:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_add_f64 v[6:7], v[0:1], -v[2:3]
 ; GCN-NSZ-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[4:5]
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v0, v6
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v1, v7
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f64:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f64 v[6:7], v[2:3], -v[0:1]
 ; VI-SAFE-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[4:5]
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v7
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v0, v6
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %add = fadd double %fneg.a, %b
@@ -749,8 +749,8 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t
 define float @v_fneg_mul_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -760,9 +760,9 @@ define float @v_fneg_mul_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_mul_store_use_mul_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_store_use_mul_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v1, v0, v1
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -774,9 +774,9 @@ define { float, float } @v_fneg_mul_store_use_mul_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_mul_multi_use_mul_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_multi_use_mul_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -789,8 +789,8 @@ define { float, float } @v_fneg_mul_multi_use_mul_f32(float %a, float %b) #0 {
 define float @v_fneg_mul_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = fmul float %fneg.a, %b
@@ -801,8 +801,8 @@ define float @v_fneg_mul_fneg_x_f32(float %a, float %b) #0 {
 define float @v_fneg_mul_x_fneg_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_x_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg float %b
   %mul = fmul float %a, %fneg.b
@@ -813,8 +813,8 @@ define float @v_fneg_mul_x_fneg_f32(float %a, float %b) #0 {
 define float @v_fneg_mul_fneg_fneg_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_fneg_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fneg.b = fneg float %b
@@ -826,10 +826,10 @@ define float @v_fneg_mul_fneg_fneg_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_mul_store_use_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_store_use_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = fmul float %fneg.a, %b
@@ -842,10 +842,10 @@ define { float, float } @v_fneg_mul_store_use_fneg_x_f32(float %a, float %b) #0
 define { float, float } @v_fneg_mul_multi_use_fneg_x_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_fneg_mul_multi_use_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v3, v0, v1
 ; GCN-NEXT:    v_mul_f32_e64 v1, -v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = fmul float %fneg.a, %b
@@ -863,10 +863,10 @@ define { float, float } @v_fneg_mul_multi_use_fneg_x_f32(float %a, float %b, flo
 define float @v_fneg_minnum_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -876,8 +876,8 @@ define float @v_fneg_minnum_f32_ieee(float %a, float %b) #0 {
 define float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -887,8 +887,8 @@ define float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #4 {
 define float @v_fneg_self_minnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_self_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %a)
   %min.fneg = fneg float %min
@@ -898,8 +898,8 @@ define float @v_fneg_self_minnum_f32_ieee(float %a) #0 {
 define float @v_fneg_self_minnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %a)
   %min.fneg = fneg float %min
@@ -909,9 +909,9 @@ define float @v_fneg_self_minnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_posk_minnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_posk_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 4.0, float %a)
   %fneg = fneg float %min
@@ -921,8 +921,8 @@ define float @v_fneg_posk_minnum_f32_ieee(float %a) #0 {
 define float @v_fneg_posk_minnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e64 v0, -v0, -4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 4.0, float %a)
   %fneg = fneg float %min
@@ -932,9 +932,9 @@ define float @v_fneg_posk_minnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_negk_minnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_negk_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float -4.0, float %a)
   %fneg = fneg float %min
@@ -944,8 +944,8 @@ define float @v_fneg_negk_minnum_f32_ieee(float %a) #0 {
 define float @v_fneg_negk_minnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e64 v0, -v0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float -4.0, float %a)
   %fneg = fneg float %min
@@ -955,9 +955,9 @@ define float @v_fneg_negk_minnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_0_minnum_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_0_minnum_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
   %fneg = fneg float %min
@@ -967,9 +967,9 @@ define float @v_fneg_0_minnum_f32(float %a) #0 {
 define float @v_fneg_neg0_minnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_neg0_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float -0.0, float %a)
   %fneg = fneg float %min
@@ -979,17 +979,17 @@ define float @v_fneg_neg0_minnum_f32_ieee(float %a) #0 {
 define float @v_fneg_inv2pi_minnum_f32(float %a) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe22f983, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; VI-NEXT:    v_min_f32_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
   %fneg = fneg float %min
@@ -999,16 +999,16 @@ define float @v_fneg_inv2pi_minnum_f32(float %a) #0 {
 define float @v_fneg_neg_inv2pi_minnum_f32(float %a) #0 {
 ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0x3e22f983, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; VI-NEXT:    v_max_f32_e32 v0, 0.15915494, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
   %fneg = fneg float %min
@@ -1018,19 +1018,19 @@ define float @v_fneg_neg_inv2pi_minnum_f32(float %a) #0 {
 define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e32 v0, v0, v0
 ; VI-NEXT:    v_min_f16_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
   %fneg = fneg half %min
@@ -1040,18 +1040,18 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0x3e230000, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; VI-NEXT:    v_max_f16_e32 v0, 0.15915494, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
   %fneg = fneg half %min
@@ -1061,19 +1061,19 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 define double @v_fneg_inv2pi_minnum_f64(double %a) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; SI-NEXT:    s_mov_b32 s4, 0x6dc9c882
 ; SI-NEXT:    s_mov_b32 s5, 0xbfc45f30
 ; SI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; VI-NEXT:    v_min_f64 v[0:1], v[0:1], 0.15915494309189532
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
   %fneg = fneg double %min
@@ -1083,18 +1083,18 @@ define double @v_fneg_inv2pi_minnum_f64(double %a) #0 {
 define double @v_fneg_neg_inv2pi_minnum_f64(double %a) #0 {
 ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; SI-NEXT:    s_mov_b32 s4, 0x6dc9c882
 ; SI-NEXT:    s_mov_b32 s5, 0x3fc45f30
 ; SI-NEXT:    v_max_f64 v[0:1], v[0:1], s[4:5]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; VI-NEXT:    v_max_f64 v[0:1], v[0:1], 0.15915494309189532
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a)
   %fneg = fneg double %min
@@ -1104,8 +1104,8 @@ define double @v_fneg_neg_inv2pi_minnum_f64(double %a) #0 {
 define float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e64 v0, -v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float -0.0, float %a)
   %fneg = fneg float %min
@@ -1115,10 +1115,10 @@ define float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_0_minnum_foldable_use_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_0_minnum_foldable_use_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 0.0, float %a)
   %fneg = fneg float %min
@@ -1129,18 +1129,18 @@ define float @v_fneg_0_minnum_foldable_use_f32_ieee(float %a, float %b) #0 {
 define float @v_fneg_inv2pi_minnum_foldable_use_f32(float %a, float %b) #0 {
 ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe22f983, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; VI-NEXT:    v_min_f32_e32 v0, 0.15915494, v0
 ; VI-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
   %fneg = fneg float %min
@@ -1151,9 +1151,9 @@ define float @v_fneg_inv2pi_minnum_foldable_use_f32(float %a, float %b) #0 {
 define float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_0_minnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float 0.0, float %a)
   %fneg = fneg float %min
@@ -1164,11 +1164,11 @@ define float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #4 {
 define { float, float } @v_fneg_minnum_multi_use_minnum_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_minnum_multi_use_minnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -1181,9 +1181,9 @@ define { float, float } @v_fneg_minnum_multi_use_minnum_f32_ieee(float %a, float
 define <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_minnum_multi_use_minnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -1200,10 +1200,10 @@ define <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %
 define float @v_fneg_maxnum_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -1213,8 +1213,8 @@ define float @v_fneg_maxnum_f32_ieee(float %a, float %b) #0 {
 define float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -1224,8 +1224,8 @@ define float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #4 {
 define float @v_fneg_self_maxnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_self_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %a)
   %max.fneg = fneg float %max
@@ -1235,8 +1235,8 @@ define float @v_fneg_self_maxnum_f32_ieee(float %a) #0 {
 define float @v_fneg_self_maxnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %a)
   %max.fneg = fneg float %max
@@ -1246,9 +1246,9 @@ define float @v_fneg_self_maxnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_posk_maxnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_posk_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float 4.0, float %a)
   %fneg = fneg float %max
@@ -1258,8 +1258,8 @@ define float @v_fneg_posk_maxnum_f32_ieee(float %a) #0 {
 define float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e64 v0, -v0, -4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float 4.0, float %a)
   %fneg = fneg float %max
@@ -1269,9 +1269,9 @@ define float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_negk_maxnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_negk_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float -4.0, float %a)
   %fneg = fneg float %max
@@ -1281,8 +1281,8 @@ define float @v_fneg_negk_maxnum_f32_ieee(float %a) #0 {
 define float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e64 v0, -v0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float -4.0, float %a)
   %fneg = fneg float %max
@@ -1292,9 +1292,9 @@ define float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_0_maxnum_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_0_maxnum_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a)
   %fneg = fneg float %max
@@ -1304,9 +1304,9 @@ define float @v_fneg_0_maxnum_f32(float %a) #0 {
 define float @v_fneg_neg0_maxnum_f32_ieee(float %a) #0 {
 ; GCN-LABEL: v_fneg_neg0_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float -0.0, float %a)
   %fneg = fneg float %max
@@ -1316,8 +1316,8 @@ define float @v_fneg_neg0_maxnum_f32_ieee(float %a) #0 {
 define float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e64 v0, -v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float -0.0, float %a)
   %fneg = fneg float %max
@@ -1327,10 +1327,10 @@ define float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #4 {
 define float @v_fneg_0_maxnum_foldable_use_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_0_maxnum_foldable_use_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float 0.0, float %a)
   %fneg = fneg float %max
@@ -1341,9 +1341,9 @@ define float @v_fneg_0_maxnum_foldable_use_f32_ieee(float %a, float %b) #0 {
 define float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_0_maxnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float 0.0, float %a)
   %fneg = fneg float %max
@@ -1354,11 +1354,11 @@ define float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #4 {
 define { float, float } @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -1371,9 +1371,9 @@ define { float, float } @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float %a, float
 define <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -1390,15 +1390,15 @@ define <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %
 define float @v_fneg_fma_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
   %fneg = fneg float %fma
@@ -1408,9 +1408,9 @@ define float @v_fneg_fma_f32(float %a, float %b, float %c) #0 {
 define { float, float } @v_fneg_fma_store_use_fma_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_fneg_fma_store_use_fma_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v1, v0, v1, v2
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
   %fneg = fneg float %fma
@@ -1422,17 +1422,17 @@ define { float, float } @v_fneg_fma_store_use_fma_f32(float %a, float %b, float
 define { float, float } @v_fneg_fma_multi_use_fma_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_multi_use_fma_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v1, v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; GCN-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_multi_use_fma_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
   %fneg = fneg float %fma
@@ -1445,15 +1445,15 @@ define { float, float } @v_fneg_fma_multi_use_fma_f32(float %a, float %b, float
 define float @v_fneg_fma_fneg_x_y_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_fneg_x_y_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, -v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_fneg_x_y_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
@@ -1464,15 +1464,15 @@ define float @v_fneg_fma_fneg_x_y_f32(float %a, float %b, float %c) #0 {
 define float @v_fneg_fma_x_fneg_y_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_x_fneg_y_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, v0, -v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_x_fneg_y_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg float %b
   %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c)
@@ -1483,15 +1483,15 @@ define float @v_fneg_fma_x_fneg_y_f32(float %a, float %b, float %c) #0 {
 define float @v_fneg_fma_fneg_fneg_y_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, -v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fneg.b = fneg float %b
@@ -1503,15 +1503,15 @@ define float @v_fneg_fma_fneg_fneg_y_f32(float %a, float %b, float %c) #0 {
 define float @v_fneg_fma_fneg_x_fneg_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, -v0, v1, -v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fneg.c = fneg float %c
@@ -1523,15 +1523,15 @@ define float @v_fneg_fma_fneg_x_fneg_f32(float %a, float %b, float %c) #0 {
 define float @v_fneg_fma_x_y_fneg_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_x_y_fneg_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, v0, v1, -v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_x_y_fneg_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, -v1, v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.c = fneg float %c
   %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
@@ -1542,19 +1542,19 @@ define float @v_fneg_fma_x_y_fneg_f32(float %a, float %b, float %c) #0 {
 define { float, float } @v_fneg_fma_store_use_fneg_x_y_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v0
 ; GCN-SAFE-NEXT:    v_fma_f32 v0, -v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-SAFE-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_xor_b32_e32 v3, 0x80000000, v0
 ; GCN-NSZ-NEXT:    v_fma_f32 v0, v0, v1, -v2
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v1, v3
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
@@ -1567,19 +1567,19 @@ define { float, float } @v_fneg_fma_store_use_fneg_x_y_f32(float %a, float %b, f
 define { float, float } @v_fneg_fma_multi_use_fneg_x_y_f32(float %a, float %b, float %c, float %d) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v1, -v0, v1, v2
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GCN-SAFE-NEXT:    v_mul_f32_e64 v1, -v0, v3
 ; GCN-SAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v2, v0, v1, -v2
 ; GCN-NSZ-NEXT:    v_mul_f32_e64 v1, -v0, v3
 ; GCN-NSZ-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
@@ -1597,15 +1597,15 @@ define { float, float } @v_fneg_fma_multi_use_fneg_x_y_f32(float %a, float %b, f
 define float @v_fneg_fmad_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fmad_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v2
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fmad_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_mad_f32 v0, v0, -v1, -v2
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
   %fneg = fneg float %fma
@@ -1615,7 +1615,6 @@ define float @v_fneg_fmad_f32(float %a, float %b, float %c) #0 {
 define <4 x float> @v_fneg_fmad_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fmad_v4f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_mac_f32_e32 v11, v3, v7
 ; GCN-SAFE-NEXT:    v_mac_f32_e32 v10, v2, v6
 ; GCN-SAFE-NEXT:    v_mac_f32_e32 v9, v1, v5
@@ -1624,15 +1623,16 @@ define <4 x float> @v_fneg_fmad_v4f32(<4 x float> %a, <4 x float> %b, <4 x float
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v9
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80000000, v10
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v11
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fmad_v4f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_mad_f32 v0, v0, -v4, -v8
 ; GCN-NSZ-NEXT:    v_mad_f32 v1, v1, -v5, -v9
 ; GCN-NSZ-NEXT:    v_mad_f32 v2, v2, -v6, -v10
 ; GCN-NSZ-NEXT:    v_mad_f32 v3, v3, -v7, -v11
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   %fneg = fneg <4 x float> %fma
@@ -1642,17 +1642,17 @@ define <4 x float> @v_fneg_fmad_v4f32(<4 x float> %a, <4 x float> %b, <4 x float
 define { float, float } @v_fneg_fmad_multi_use_fmad_f32(float %a, float %b, float %c) #0 {
 ; GCN-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v2
 ; GCN-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v2
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_mad_f32 v0, v0, -v1, -v2
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
   %fneg = fneg float %fma
@@ -1669,8 +1669,8 @@ define { float, float } @v_fneg_fmad_multi_use_fmad_f32(float %a, float %b, floa
 define double @v_fneg_fp_extend_f32_to_f64(float %a) #0 {
 ; GCN-LABEL: v_fneg_fp_extend_f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e64 v[0:1], -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext float %a to double
   %fneg = fneg double %fpext
@@ -1680,8 +1680,8 @@ define double @v_fneg_fp_extend_f32_to_f64(float %a) #0 {
 define double @v_fneg_fp_extend_fneg_f32_to_f64(float %a) #0 {
 ; GCN-LABEL: v_fneg_fp_extend_fneg_f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fpext = fpext float %fneg.a to double
@@ -1692,10 +1692,10 @@ define double @v_fneg_fp_extend_fneg_f32_to_f64(float %a) #0 {
 define { double, float } @v_fneg_fp_extend_store_use_fneg_f32_to_f64(float %a) #0 {
 ; GCN-LABEL: v_fneg_fp_extend_store_use_fneg_f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fpext = fpext float %fneg.a to double
@@ -1708,10 +1708,10 @@ define { double, float } @v_fneg_fp_extend_store_use_fneg_f32_to_f64(float %a) #
 define { double, double } @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(float %a) #0 {
 ; GCN-LABEL: v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext float %a to double
   %fneg = fneg double %fpext
@@ -1723,19 +1723,19 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(float %a)
 define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(float %a) #0 {
 ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; SI-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; SI-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; VI-NEXT:    v_mul_f64 v[2:3], v[0:1], 4.0
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext float %a to double
   %fneg = fneg double %fpext
@@ -1749,16 +1749,16 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(f
 define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 {
 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to float
   %fneg = fneg float %fpext
@@ -1770,18 +1770,18 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0
 define { float, float } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(half %a) #0 {
 ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; VI-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fpext = fpext half %a to float
   %fneg = fneg float %fpext
@@ -1798,8 +1798,8 @@ define { float, float } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(hal
 define float @v_fneg_fp_round_f64_to_f32(double %a) #0 {
 ; GCN-LABEL: v_fneg_fp_round_f64_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to float
   %fneg = fneg float %fpround
@@ -1809,8 +1809,8 @@ define float @v_fneg_fp_round_f64_to_f32(double %a) #0 {
 define float @v_fneg_fp_round_fneg_f64_to_f32(double %a) #0 {
 ; GCN-LABEL: v_fneg_fp_round_fneg_f64_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to float
@@ -1821,11 +1821,11 @@ define float @v_fneg_fp_round_fneg_f64_to_f32(double %a) #0 {
 define { float, double } @v_fneg_fp_round_store_use_fneg_f64_to_f32(double %a) #0 {
 ; GCN-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v3, v[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to float
@@ -1838,19 +1838,19 @@ define { float, double } @v_fneg_fp_round_store_use_fneg_f64_to_f32(double %a) #
 define { float, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f32(double %a, double %c) #0 {
 ; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f64_e32 v4, v[0:1]
 ; SI-NEXT:    v_mul_f64 v[1:2], -v[0:1], v[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[2:3]
 ; VI-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
 ; VI-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to float
@@ -1864,15 +1864,15 @@ define { float, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f32(double %a, d
 define half @v_fneg_fp_round_f32_to_f16(float %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_f32_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_f32_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc float %a to half
   %fneg = fneg half %fpround
@@ -1882,15 +1882,15 @@ define half @v_fneg_fp_round_f32_to_f16(float %a) #0 {
 define half @v_fneg_fp_round_fneg_f32_to_f16(float %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fpround = fptrunc float %fneg.a to half
@@ -1901,9 +1901,9 @@ define half @v_fneg_fp_round_fneg_f32_to_f16(float %a) #0 {
 define { float, float } @v_fneg_multi_use_fp_round_fneg_f64_to_f32(double %a) #0 {
 ; GCN-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v1, v[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to float
   %fneg = fneg float %fpround
@@ -1915,19 +1915,19 @@ define { float, float } @v_fneg_multi_use_fp_round_fneg_f64_to_f32(double %a) #0
 define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 {
 ; SI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fpround = fptrunc float %fneg.a to half
@@ -1940,19 +1940,19 @@ define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 {
 define { half, float } @v_fneg_fp_round_multi_use_fneg_f32_to_f16(float %a, float %c) #0 {
 ; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-NEXT:    v_mul_f32_e64 v1, -v0, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; VI-NEXT:    v_mul_f32_e64 v1, -v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fpround = fptrunc float %fneg.a to half
@@ -1970,8 +1970,8 @@ define { half, float } @v_fneg_fp_round_multi_use_fneg_f32_to_f16(float %a, floa
 define float @v_fneg_rcp_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_rcp_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
   %fneg = fneg float %rcp
@@ -1981,8 +1981,8 @@ define float @v_fneg_rcp_f32(float %a) #0 {
 define float @v_fneg_rcp_fneg_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_rcp_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
@@ -1993,12 +1993,11 @@ define float @v_fneg_rcp_fneg_f32(float %a) #0 {
 define void @v_fneg_rcp_store_use_fneg_f32(float %a, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
 ; GCN-LABEL: v_fneg_rcp_store_use_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v5, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-NEXT:    flat_store_dword v[1:2], v5
 ; GCN-NEXT:    flat_store_dword v[3:4], v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
@@ -2011,10 +2010,10 @@ define void @v_fneg_rcp_store_use_fneg_f32(float %a, ptr addrspace(1) %ptr0, ptr
 define { float, float } @v_fneg_rcp_multi_use_fneg_f32(float %a, float %c) #0 {
 ; GCN-LABEL: v_fneg_rcp_multi_use_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v0
 ; GCN-NEXT:    v_mul_f32_e64 v1, -v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
@@ -2029,10 +2028,10 @@ define { float, float } @v_fneg_rcp_multi_use_fneg_f32(float %a, float %c) #0 {
 define float @v_negated_rcp_f32(float %arg0, float %arg1) #1 {
 ; GCN-LABEL: v_negated_rcp_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, 2.0
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg0 = fneg float %arg0
   %fma = call nsz float @llvm.fma.f32(float %neg.arg0, float %arg1, float -2.0)
@@ -2048,8 +2047,8 @@ define float @v_negated_rcp_f32(float %arg0, float %arg1) #1 {
 define float @v_fneg_mul_legacy_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e64 v0, v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %fneg = fneg float %mul
@@ -2059,9 +2058,9 @@ define float @v_fneg_mul_legacy_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_mul_legacy_store_use_mul_legacy_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_store_use_mul_legacy_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v1, v0, v1
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %fneg = fneg float %mul
@@ -2073,9 +2072,9 @@ define { float, float } @v_fneg_mul_legacy_store_use_mul_legacy_f32(float %a, fl
 define { float, float } @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_multi_use_mul_legacy_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e64 v0, v0, -v1
 ; GCN-NEXT:    v_mul_legacy_f32_e64 v1, -v0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %fneg = fneg float %mul
@@ -2088,8 +2087,8 @@ define { float, float } @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float %a, fl
 define float @v_fneg_mul_legacy_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
@@ -2100,8 +2099,8 @@ define float @v_fneg_mul_legacy_fneg_x_f32(float %a, float %b) #0 {
 define float @v_fneg_mul_legacy_x_fneg_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_x_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.b = fneg float %b
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
@@ -2112,8 +2111,8 @@ define float @v_fneg_mul_legacy_x_fneg_f32(float %a, float %b) #0 {
 define float @v_fneg_mul_legacy_fneg_fneg_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_fneg_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e64 v0, v0, -v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fneg.b = fneg float %b
@@ -2125,10 +2124,10 @@ define float @v_fneg_mul_legacy_fneg_fneg_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_mul_legacy_store_use_fneg_x_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_store_use_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
@@ -2142,10 +2141,10 @@ define { float, float } @v_fneg_mul_legacy_store_use_fneg_x_f32(float %a, float
 define { float, float } @v_fneg_mul_legacy_multi_use_fneg_x_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_fneg_mul_legacy_multi_use_fneg_x_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v3, v0, v1
 ; GCN-NEXT:    v_mul_legacy_f32_e64 v1, -v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
@@ -2163,10 +2162,10 @@ define { float, float } @v_fneg_mul_legacy_multi_use_fneg_x_f32(float %a, float
 define float @v_fneg_sin_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_sin_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xbe22f983, v0
 ; GCN-NEXT:    v_fract_f32_e32 v0, v0
 ; GCN-NEXT:    v_sin_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sin = call float @llvm.sin.f32(float %a)
   %fneg = fneg float %sin
@@ -2176,8 +2175,8 @@ define float @v_fneg_sin_f32(float %a) #0 {
 define float @v_fneg_amdgcn_sin_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_amdgcn_sin_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sin_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sin = call float @llvm.amdgcn.sin.f32(float %a)
   %fneg = fneg float %sin
@@ -2191,8 +2190,8 @@ define float @v_fneg_amdgcn_sin_f32(float %a) #0 {
 define float @v_fneg_trunc_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_trunc_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trunc_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %trunc = call float @llvm.trunc.f32(float %a)
   %fneg = fneg float %trunc
@@ -2206,7 +2205,6 @@ define float @v_fneg_trunc_f32(float %a) #0 {
 define float @v_fneg_round_f32(float %a) #0 {
 ; GCN-SAFE-LABEL: v_fneg_round_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_trunc_f32_e32 v1, v0
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GCN-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -2215,11 +2213,11 @@ define float @v_fneg_round_f32(float %a) #0 {
 ; GCN-SAFE-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GCN-SAFE-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GCN-SAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fneg_round_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_trunc_f32_e32 v1, v0
 ; GCN-NSZ-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GCN-NSZ-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -2227,6 +2225,7 @@ define float @v_fneg_round_f32(float %a) #0 {
 ; GCN-NSZ-NEXT:    s_brev_b32 s4, -2
 ; GCN-NSZ-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GCN-NSZ-NEXT:    v_sub_f32_e64 v0, -v1, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %round = call float @llvm.round.f32(float %a)
   %fneg = fneg float %round
@@ -2240,8 +2239,8 @@ define float @v_fneg_round_f32(float %a) #0 {
 define float @v_fneg_rint_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_rint_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rndne_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %rint = call float @llvm.rint.f32(float %a)
   %fneg = fneg float %rint
@@ -2255,8 +2254,8 @@ define float @v_fneg_rint_f32(float %a) #0 {
 define float @v_fneg_nearbyint_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_nearbyint_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rndne_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %nearbyint = call float @llvm.nearbyint.f32(float %a)
   %fneg = fneg float %nearbyint
@@ -2270,8 +2269,8 @@ define float @v_fneg_nearbyint_f32(float %a) #0 {
 define float @v_fneg_canonicalize_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_canonicalize_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %trunc = call float @llvm.canonicalize.f32(float %a)
   %fneg = fneg float %trunc
@@ -2286,8 +2285,8 @@ define float @v_fneg_arithmetic_fence_f32(float %a) #0 {
 ; GCN-LABEL: v_fneg_arithmetic_fence_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    ;ARITH_FENCE
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fence = call float @llvm.arithmetic.fence.f32(float %a)
   %fneg = fneg float %fence
@@ -2297,10 +2296,10 @@ define float @v_fneg_arithmetic_fence_f32(float %a) #0 {
 define float @v_fneg_arithmetic_fence_fmul_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_fneg_arithmetic_fence_fmul_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-NEXT:    ;ARITH_FENCE
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fence = call float @llvm.arithmetic.fence.f32(float %mul)
@@ -2315,20 +2314,20 @@ define float @v_fneg_arithmetic_fence_fmul_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_interp_p1_f32(float %a, float %b) #0 {
 ; SI-LABEL: v_fneg_interp_p1_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v1, v0, -v1
 ; SI-NEXT:    s_mov_b32 m0, 0
 ; SI-NEXT:    v_interp_p1_f32 v0, v1, attr0.x
 ; SI-NEXT:    v_interp_p1_f32 v1, v1, attr0.y
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_interp_p1_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v1, v0, -v1
 ; VI-NEXT:    s_mov_b32 m0, 0
 ; VI-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; VI-NEXT:    v_interp_p1_f32_e32 v1, v1, attr0.y
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -2342,24 +2341,24 @@ define { float, float } @v_fneg_interp_p1_f32(float %a, float %b) #0 {
 define { float, float } @v_fneg_interp_p2_f32(float %a, float %b) #0 {
 ; SI-LABEL: v_fneg_interp_p2_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v2, v0, -v1
 ; SI-NEXT:    v_mov_b32_e32 v1, 4.0
 ; SI-NEXT:    v_mov_b32_e32 v0, 4.0
 ; SI-NEXT:    s_mov_b32 m0, 0
 ; SI-NEXT:    v_interp_p2_f32 v0, v2, attr0.x
 ; SI-NEXT:    v_interp_p2_f32 v1, v2, attr0.y
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_interp_p2_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v2, v0, -v1
 ; VI-NEXT:    v_mov_b32_e32 v1, 4.0
 ; VI-NEXT:    v_mov_b32_e32 v0, 4.0
 ; VI-NEXT:    s_mov_b32 m0, 0
 ; VI-NEXT:    v_interp_p2_f32_e32 v0, v2, attr0.x
 ; VI-NEXT:    v_interp_p2_f32_e32 v1, v2, attr0.y
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -2377,13 +2376,13 @@ define { float, float } @v_fneg_interp_p2_f32(float %a, float %b) #0 {
 define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %d) #0 {
 ; SI-LABEL: v_fneg_copytoreg_f32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; SI-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_cbranch_execz .LBB118_2
 ; SI-NEXT:  ; %bb.1: ; %if
@@ -2393,18 +2392,18 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, flo
 ; SI-NEXT:  .LBB118_2: ; %endif
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    flat_store_dword v[0:1], v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_copytoreg_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v6, 0x3ff, v31
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v6
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_cbranch_execz .LBB118_2
 ; VI-NEXT:  ; %bb.1: ; %if
@@ -2414,7 +2413,7 @@ define void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, float %a, float %b, flo
 ; VI-NEXT:  .LBB118_2: ; %endif
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -2442,11 +2441,11 @@ endif:
 define float @v_fneg_inlineasm_f32(float %a, float %b, float %c, i32 %d) #0 {
 ; GCN-LABEL: v_fneg_inlineasm_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, -v1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -2462,12 +2461,12 @@ define float @v_fneg_inlineasm_f32(float %a, float %b, float %c, i32 %d) #0 {
 define float @v_fneg_inlineasm_multi_use_src_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %d) #0 {
 ; GCN-LABEL: v_fneg_inlineasm_multi_use_src_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v2, v3
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v1
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %a, %b
   %fneg = fneg float %mul
@@ -2484,10 +2483,10 @@ define float @v_fneg_inlineasm_multi_use_src_f32(ptr addrspace(1) %out, float %a
 define { float, float } @multiuse_fneg_2_vop3_users_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: multiuse_fneg_2_vop3_users_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v3, -v0, v1, v2
 ; GCN-NEXT:    v_fma_f32 v1, -v0, v2, 2.0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
@@ -2502,10 +2501,10 @@ define { float, float } @multiuse_fneg_2_vop3_users_f32(float %a, float %b, floa
 define { float, float } @multiuse_fneg_2_vop2_users_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: multiuse_fneg_2_vop2_users_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v3, -v0, v1
 ; GCN-NEXT:    v_mul_f32_e64 v1, -v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %mul0 = fmul float %fneg.a, %b
@@ -2519,10 +2518,10 @@ define { float, float } @multiuse_fneg_2_vop2_users_f32(float %a, float %b, floa
 define { float, float } @multiuse_fneg_vop2_vop3_users_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: multiuse_fneg_vop2_vop3_users_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v3, -v0, v1, 2.0
 ; GCN-NEXT:    v_mul_f32_e64 v1, -v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg float %a
   %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
@@ -2538,18 +2537,18 @@ define { float, float } @multiuse_fneg_vop2_vop3_users_f32(float %a, float %b, f
 define { float, float } @free_fold_src_code_size_cost_use_f32(ptr addrspace(1) %out, float %a, float %b, float %c, float %d) #0 {
 ; GCN-SAFE-LABEL: free_fold_src_code_size_cost_use_f32:
 ; GCN-SAFE:       ; %bb.0:
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_fma_f32 v1, v2, v3, 2.0
 ; GCN-SAFE-NEXT:    v_mul_f32_e64 v0, -v1, v4
 ; GCN-SAFE-NEXT:    v_mul_f32_e64 v1, -v1, v5
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: free_fold_src_code_size_cost_use_f32:
 ; GCN-NSZ:       ; %bb.0:
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_fma_f32 v1, v2, -v3, -2.0
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v0, v1, v4
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
   %fneg.fma0 = fneg float %fma0
@@ -2564,10 +2563,10 @@ define { float, float } @free_fold_src_code_size_cost_use_f32(ptr addrspace(1) %
 define { double, double } @free_fold_src_code_size_cost_use_f64(double %a, double %b, double %c, double %d) #0 {
 ; GCN-LABEL: free_fold_src_code_size_cost_use_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[0:1], v[2:3], 2.0
 ; GCN-NEXT:    v_mul_f64 v[0:1], -v[2:3], v[4:5]
 ; GCN-NEXT:    v_mul_f64 v[2:3], -v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
   %fneg.fma0 = fneg double %fma0
@@ -2584,9 +2583,9 @@ define { double, double } @free_fold_src_code_size_cost_use_f64(double %a, doubl
 define float @one_use_cost_to_fold_into_src_f32(float %a, float %b, float %c, float %d) #0 {
 ; GCN-LABEL: one_use_cost_to_fold_into_src_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trunc_f32_e32 v0, v0
 ; GCN-NEXT:    v_fma_f32 v0, -v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %trunc.a = call float @llvm.trunc.f32(float %a)
   %trunc.fneg.a = fneg float %trunc.a
@@ -2597,10 +2596,10 @@ define float @one_use_cost_to_fold_into_src_f32(float %a, float %b, float %c, fl
 define { float, float } @multi_use_cost_to_fold_into_src(float %a, float %b, float %c, float %d) #0 {
 ; GCN-LABEL: multi_use_cost_to_fold_into_src:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v0
 ; GCN-NEXT:    v_fma_f32 v0, -v4, v1, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v4, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %trunc.a = call float @llvm.trunc.f32(float %a)
   %trunc.fneg.a = fneg float %trunc.a
@@ -2618,7 +2617,6 @@ define { float, float } @multi_use_cost_to_fold_into_src(float %a, float %b, flo
 define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 {
 ; GCN-LABEL: fneg_fma_fneg_dagcombine_loop:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_brev_b32 s4, 1
 ; GCN-NEXT:    v_fma_f32 v3, v3, -v5, s4
 ; GCN-NEXT:    v_fma_f32 v2, v2, -v4, s4
@@ -2626,6 +2624,7 @@ define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float>
 ; GCN-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer)
@@ -2640,8 +2639,8 @@ bb:
 define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
 ; GCN-LABEL: nnan_fmul_neg1_to_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul float %x, -1.0
   %add = fmul nnan float %mul, %y
@@ -2653,8 +2652,8 @@ define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 {
 define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
 ; GCN-LABEL: denormal_fmul_neg1_to_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul nnan float %x, -1.0
   %add = fmul float %mul, %y
@@ -2665,9 +2664,9 @@ define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
 define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
 ; GCN-LABEL: denorm_snan_fmul_neg1_to_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, -v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %canonical = fmul float %x, %x
   %mul = fmul float %canonical, -1.0
@@ -2678,9 +2677,9 @@ define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
 define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
 ; GCN-LABEL: flush_snan_fmul_neg1_to_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %quiet = call float @llvm.canonicalize.f32(float %x)
   %mul = fmul float %quiet, -1.0
@@ -2691,10 +2690,10 @@ define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
 define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z) {
 ; GCN-LABEL: fadd_select_fneg_fneg_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GCN-NEXT:    v_sub_f32_e32 v0, v3, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg float %x
@@ -2707,11 +2706,11 @@ define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z)
 define double @fadd_select_fneg_fneg_f64(i32 %arg0, double %x, double %y, double %z) {
 ; GCN-LABEL: fadd_select_fneg_fneg_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    v_add_f64 v[0:1], v[5:6], -v[1:2]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg double %x
@@ -2724,7 +2723,6 @@ define double @fadd_select_fneg_fneg_f64(i32 %arg0, double %x, double %y, double
 define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 ; SI-LABEL: fadd_select_fneg_fneg_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -2734,14 +2732,15 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: fadd_select_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg half %x
@@ -2755,7 +2754,6 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; SI-LABEL: fadd_select_fneg_fneg_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -2775,16 +2773,17 @@ define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x ha
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; SI-NEXT:    v_sub_f32_e32 v1, v3, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: fadd_select_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %arg0, 0
   %neg.x = fneg <2 x half> %x
@@ -2836,11 +2835,11 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 %
 define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2851,11 +2850,11 @@ define float @v_fneg_select_infloop_regression_f32(float %arg, i1 %arg1) {
 define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2866,11 +2865,11 @@ define float @v_fneg_select_infloop_regression_f32_commute0(float %arg, i1 %arg1
 define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = fneg float %i
@@ -2881,11 +2880,11 @@ define float @v_fneg_select_infloop_regression_f32_commute1(float %arg, i1 %arg1
 define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f32_commute2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 0.0
   %i2 = fneg float %i
@@ -2897,11 +2896,11 @@ define float @v_fneg_select_infloop_regression_f32_commute2(float %arg, i1 %arg1
 define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2912,11 +2911,11 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32(float %arg, i1 %ar
 define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, 2.0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2927,11 +2926,11 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute0(float %ar
 define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 2.0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 2.0, float %arg
   %i2 = fneg float %i
@@ -2942,11 +2941,11 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute1(float %ar
 define float @v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float 2.0
   %i2 = fneg float %i
@@ -2958,11 +2957,11 @@ define float @v_fneg_select_infloop_regression_inline_imm_f32_commute2(float %ar
 define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -2973,11 +2972,11 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32(float %arg, i1
 define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, -2.0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -2988,11 +2987,11 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute0(float
 define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -2.0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float -2.0, float %arg
   %i2 = fneg float %i
@@ -3003,11 +3002,11 @@ define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute1(float
 define float @v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_neg_inline_imm_f32_commute2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, -2.0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -2.0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float %arg, float -2.0
   %i2 = fneg float %i
@@ -3065,13 +3064,13 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
 define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, double 0.0, double %arg
   %i2 = fneg double %i
@@ -3121,23 +3120,23 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
 define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
 ; SI-LABEL: v_fneg_select_infloop_regression_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v1
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v0, -v0, 0, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_select_infloop_regression_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 1, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, half 0.0, half %arg
   %i2 = fneg half %i
@@ -3185,7 +3184,6 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
 define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %arg1) {
 ; SI-LABEL: v_fneg_select_infloop_regression_v2f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3198,16 +3196,17 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_select_infloop_regression_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 1, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x half> zeroinitializer, <2 x half> %arg
   %i2 = fneg <2 x half> %i
@@ -3263,7 +3262,6 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
 define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_select_infloop_regression_v2f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GCN-NEXT:    v_bfrev_b32_e32 v3, 1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
@@ -3271,6 +3269,7 @@ define <2 x float> @v_fneg_select_infloop_regression_v2f32(<2 x float> %arg, i1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
   %i2 = fneg <2 x float> %i
@@ -3316,11 +3315,11 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 %
 define float @v_fabs_select_infloop_regression_f32(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fabs_select_infloop_regression_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, |v0|, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
@@ -3367,11 +3366,11 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1
 define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
 ; GCN-LABEL: v_fneg_fabs_select_infloop_regression:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v0|, 0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %i = select i1 %arg1, float 0.0, float %arg
   %i2 = call float @llvm.fabs.f32(float %i)
@@ -3383,15 +3382,15 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
 define float @v_fmul_0_fsub_0_infloop_regression(float %arg) {
 ; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
 ; GCN-SAFE:       ; %bb.0: ; %bb
-; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
 ; GCN-SAFE-NEXT:    v_sub_f32_e32 v0, 0, v0
+; GCN-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression:
 ; GCN-NSZ:       ; %bb.0: ; %bb
-; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    v_mul_f32_e32 v0, 0x80000000, v0
+; GCN-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NSZ-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul float %arg, 0.0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
index 0a2cb54877b8007..d758ffb0f51271d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll
@@ -12,18 +12,18 @@
 define { double, double } @testfn(double %arg, double %arg1, double %arg2) {
 ; GFX9-LABEL: testfn:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], -v[0:1]
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
 ; GFX9-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: testfn:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], -v[0:1]
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[4:5], -v[2:3]
 ; GFX10-NEXT:    v_add_f64 v[2:3], -v[2:3], -v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = fsub fast double 0.000000e+00, %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 62e7a232d2b2569..f517760d8d79c28 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -6,19 +6,19 @@
 define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   %fneg = xor i32 %select, -2147483648
@@ -28,18 +28,17 @@ define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
 define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-LABEL: fneg_xor_select_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v4, -v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v5, -v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -47,6 +46,7 @@ define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v4, -v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v5, -v3, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select <2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1
   %fneg = xor <2 x i32> %select, <i32 -2147483648, i32 -2147483648>
@@ -56,35 +56,33 @@ define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i3
 define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX7-NEXT:    flat_store_dword v[3:4], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    global_store_dword v[3:4], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i32_multi_use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80000000, v1
 ; GFX11-NEXT:    global_store_b32 v[3:4], v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i32 %arg0, i32 %arg1
   store i32 %select, ptr addrspace(1) %ptr
@@ -95,21 +93,21 @@ define i32 @fneg_xor_select_i32_multi_use(i1 %cond, i32 %arg0, i32 %arg1, ptr ad
 define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -119,7 +117,6 @@ define i64 @fneg_xor_select_i64(i1 %cond, i64 %arg0, i64 %arg1) {
 define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1) {
 ; GCN-LABEL: fneg_xor_select_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -128,11 +125,11 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v8, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v7, -v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, -v9, -v5, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -142,6 +139,7 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, v4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, -v9, -v5, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select <2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1
   %fneg = xor <2 x i64> %select, <i64 9223372036854775808, i64 9223372036854775808>
@@ -151,21 +149,21 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6
 define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   %fneg = xor i16 %select, -32768
@@ -175,7 +173,6 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: fneg_xor_select_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -188,11 +185,11 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -204,11 +201,11 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
@@ -221,6 +218,7 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select <2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1
   %fneg = xor <2 x i16> %select, <i16 -32768, i16 -32768>
@@ -230,35 +228,33 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX7-NEXT:    flat_store_short v[3:4], v0
 ; GFX7-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GFX9-NEXT:    global_store_short v[3:4], v0, off
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i16_multi_use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
 ; GFX11-NEXT:    global_store_b16 v[3:4], v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   store i16 %select, ptr addrspace(1) %ptr
@@ -269,7 +265,6 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -277,12 +272,11 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -290,12 +284,11 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_multi_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -303,6 +296,7 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v4, -v2, vcc_lo
 ; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   store i64 %select, ptr addrspace(1) %ptr
@@ -313,7 +307,6 @@ define i64 @fneg_xor_select_i64_multi_user(i1 %cond, i64 %arg0, i64 %arg1, ptr a
 define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg1) {
 ; GCN-LABEL: select_fneg_xor_select_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -322,11 +315,11 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -338,6 +331,7 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i32 %arg0, -2147483648
   %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -349,18 +343,17 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %arg1) {
 ; GCN-LABEL: select_fneg_select_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -369,6 +362,7 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg float %arg0
   %select0 = select i1 %cond0, float %arg1, float %fneg0
@@ -380,21 +374,21 @@ define float @select_fneg_select_f32(i1 %cond0, i1 %cond1, float %arg0, float %a
 define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 ; GCN-LABEL: fneg_xor_select_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v4, -v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
   %fneg = fneg double %select
@@ -404,7 +398,6 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
 define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %arg1, ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -412,12 +405,11 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX7-NEXT:    flat_store_dwordx2 v[5:6], v[0:1]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -425,12 +417,11 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[5:6], v[0:1], off
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_f64_multi_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
@@ -439,6 +430,7 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    global_store_b64 v[5:6], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, double %arg0, double %arg1
   store double %select, ptr addrspace(1) %ptr
@@ -449,22 +441,22 @@ define double @fneg_xor_select_f64_multi_user(i1 %cond, double %arg0, double %ar
 define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_user_with_srcmods:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
 ; GFX11-NEXT:    v_add_f64 v[0:1], -v[1:2], 2.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
@@ -476,7 +468,6 @@ define double @fneg_xor_select_i64_user_with_srcmods(i1 %cond, i64 %arg0, i64 %a
 define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, double %arg1) {
 ; GCN-LABEL: select_fneg_select_fneg_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -486,11 +477,11 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_fneg_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -503,6 +494,7 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg double %arg0
   %select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -514,7 +506,6 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
 define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg1) {
 ; GCN-LABEL: select_fneg_xor_select_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -524,11 +515,11 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -541,6 +532,7 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i64 %arg0, 9223372036854775808
   %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -552,7 +544,6 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1) {
 ; GFX7-LABEL: select_fneg_select_f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e64 v2, -v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -563,11 +554,11 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_select_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -576,11 +567,11 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX9-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -592,6 +583,7 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg half %arg0
   %select0 = select i1 %cond0, half %arg1, half %fneg0
@@ -603,7 +595,6 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg1) {
 ; GCN-LABEL: select_fneg_xor_select_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
@@ -612,11 +603,11 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GCN-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
@@ -628,6 +619,7 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i16 %arg0, -32768
   %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0
@@ -639,7 +631,6 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x half> %arg0, <2 x half> %arg1) {
 ; GFX7-LABEL: select_fneg_select_v2f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
@@ -672,11 +663,11 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_select_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -697,11 +688,11 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_select_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
@@ -725,6 +716,7 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg <2 x half> %arg0
   %select0 = select <2 x i1> %cond0, <2 x half> %arg1, <2 x half> %fneg0
@@ -736,7 +728,6 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, <2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: select_fneg_xor_select_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_xor_b32_e32 v5, 0xffff8000, v5
@@ -757,11 +748,11 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_xor_select_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -782,11 +773,11 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
@@ -810,6 +801,7 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor <2 x i16> %arg0, <i16 -32768, i16 -32768>
   %select0 = select <2 x i1> %cond0, <2 x i16> %arg1, <2 x i16> %fneg0
@@ -823,7 +815,6 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GCN-LABEL: cospiD_pattern0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
@@ -833,11 +824,11 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: cospiD_pattern0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v0
 ; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -847,6 +838,7 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 31, v5
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i32 %arg, 1
   %i3 = icmp eq i32 %i, 0
@@ -864,7 +856,6 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 ; GCN-LABEL: cospiD_pattern1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v5, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
@@ -873,11 +864,11 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: cospiD_pattern1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
@@ -888,6 +879,7 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i32 %arg, 1
   %i3 = icmp eq i32 %i, 0
@@ -902,7 +894,6 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
 define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX7-LABEL: cospiD_pattern0_half:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -914,11 +905,11 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
 ; GFX7-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: cospiD_pattern0_half:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -928,11 +919,11 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX9-NEXT:    v_xor_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: cospiD_pattern0_half:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v0
 ; GFX11-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -945,6 +936,7 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i16 %arg, 1
   %i3 = icmp eq i16 %i, 0
@@ -962,34 +954,34 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX7-LABEL: cospiD_pattern1_half:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: cospiD_pattern1_half:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_lt_i16_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: cospiD_pattern1_half:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i16 %arg, 1
   %i3 = icmp eq i16 %i, 0
@@ -1003,14 +995,14 @@ define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
 define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) {
 ; GCN-LABEL: fneg_f64_bitcast_vector_i64_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_vector_i64_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i64 %arg to double
   %fneg = fneg double %bitcast
@@ -1020,14 +1012,14 @@ define double @fneg_f64_bitcast_vector_i64_to_f64(i64 %arg) {
 define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) {
 ; GCN-LABEL: fneg_f64_bitcast_vector_v2i32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_vector_v2i32_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast <2 x i32> %arg to double
   %fneg = fneg double %bitcast
@@ -1037,14 +1029,14 @@ define double @fneg_f64_bitcast_vector_v2i32_to_f64(<2 x i32> %arg) {
 define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) {
 ; GCN-LABEL: fneg_f64_bitcast_vector_v2f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_vector_v2f32_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast <2 x float> %arg to double
   %fneg = fneg double %bitcast
@@ -1054,7 +1046,6 @@ define double @fneg_f64_bitcast_vector_v2f32_to_f64(<2 x float> %arg) {
 define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
 ; GFX7-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1062,18 +1053,19 @@ define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_vector_v4i16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast <4 x i16> %arg to double
   %fneg = fneg double %bitcast
@@ -1083,7 +1075,6 @@ define double @fneg_f64_bitcast_vector_v4i16_to_f64(<4 x i16> %arg) {
 define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
 ; GFX7-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -1093,18 +1084,19 @@ define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_vector_v4f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast <4 x half> %arg to double
   %fneg = fneg double %bitcast
@@ -1114,14 +1106,14 @@ define double @fneg_f64_bitcast_vector_v4f16_to_f64(<4 x half> %arg) {
 define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x i32> poison, i32 %elt0, i32 0
   %insert.1 = insertelement <2 x i32> %insert.0, i32 %elt1, i32 1
@@ -1133,14 +1125,14 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64(i32 %elt0, i32 %elt1)
 define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %elt1) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float %elt1, i32 1
@@ -1152,7 +1144,6 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64(float %elt0, float %el
 define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1, i16 %elt2, i16 %elt3) {
 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -1160,24 +1151,25 @@ define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1,
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x i16> poison, i16 %elt0, i32 0
   %insert.1 = insertelement <4 x i16> %insert.0, i16 %elt1, i32 1
@@ -1191,7 +1183,6 @@ define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1,
 define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1, half %elt2, half %elt3) {
 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -1201,24 +1192,25 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s4
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x half> poison, half %elt0, i32 0
   %insert.1 = insertelement <4 x half> %insert.0, half %elt1, i32 1
@@ -1232,30 +1224,30 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
 define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat %elt1, bfloat %elt2, bfloat %elt3) {
 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_alignbit_b32 v2, v3, v2, 16
 ; GFX7-NEXT:    v_alignbit_b32 v0, v1, v0, 16
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    v_perm_b32 v2, v2, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v2, v2, v3, 0x3020706
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x3020706
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x bfloat> poison, bfloat %elt0, i32 0
   %insert.1 = insertelement <4 x bfloat> %insert.0, bfloat %elt1, i32 1
@@ -1269,14 +1261,14 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
 define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt0, i32 %elt1, double %fp.val) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x i32> poison, i32 %elt0, i32 0
   %insert.1 = insertelement <2 x i32> %insert.0, i32 %elt1, i32 1
@@ -1289,20 +1281,20 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user(i32 %elt
 define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user(i32 %elt0, i32 %elt1, double %fp.val0, double %fp.val1) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[6:7], -v[0:1], v[2:3]
 ; GCN-NEXT:    v_mul_f64 v[2:3], v[4:5], -v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v6
 ; GCN-NEXT:    v_mov_b32_e32 v1, v7
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modifier_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f64 v[6:7], -v[0:1], v[2:3]
 ; GFX11-NEXT:    v_mul_f64 v[2:3], v[4:5], -v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x i32> poison, i32 %elt0, i32 0
   %insert.1 = insertelement <2 x i32> %insert.0, i32 %elt1, i32 1
@@ -1318,17 +1310,17 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2i32_to_f64_multi_modi
 define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source(i32 %elt0, i32 %elt1, double %fp.val) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_neg_source:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.elt1 = xor i32 %elt1, -2147483648
   %insert.0 = insertelement <2 x i32> poison, i32 %elt0, i32 0
@@ -1342,14 +1334,14 @@ define double @fneg_f64_bitcast_build_vector_v2i32_to_f64_modifier_user_integer_
 define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float %elt0, float %elt1) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v1, -2.0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_f32_e32 v1, -2.0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fadd = fadd nsz nnan float %elt1, 2.0
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
@@ -1362,26 +1354,24 @@ define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float
 define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(float %elt0, float %elt1, ptr addrspace(1) %ptr) {
 ; GFX7-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX7-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float %elt1, i32 1
@@ -1394,19 +1384,19 @@ define double @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_user(fl
 define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user(float %elt0, float %elt1, <2 x float> %arg.v2f32) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v2, v0, v2
 ; GCN-NEXT:    v_add_f32_e32 v3, v1, v3
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_source_foldable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
 ; GFX11-NEXT:    v_dual_add_f32 v2, v0, v2 :: v_dual_add_f32 v3, v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float %elt1, i32 1
@@ -1421,18 +1411,18 @@ define { double, <2 x float> } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitca
 define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user(float %elt0, float %elt1) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float %elt1, i32 1
@@ -1446,16 +1436,16 @@ define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_us
 define { double, double } @fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user(float %elt0, float %elt1, double %arg.f64) {
 ; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_to_f64_bitcast_foldable_user:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <2 x float> poison, float %elt0, i32 0
   %insert.1 = insertelement <2 x float> %insert.0, float %elt1, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
index 4a4a576ceafada6..82cc22f78405c91 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
@@ -4,16 +4,17 @@
 define float @fold_abs_in_branch(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX10-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT:    v_add_f32_e64 v0, |v1|, |v1|
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -35,11 +36,11 @@ exit:
 define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch_multiple_users:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e64 v1, |v0|, |v0|
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    v_cmpx_nlt_f32_e32 1.0, v1
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_mul_f32_e64 v1, 0x3e4ccccd, |v0|
@@ -47,6 +48,7 @@ define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) {
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_add_f32_e64 v0, |v0|, 2.0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -70,13 +72,13 @@ exit:
 define float @fold_abs_in_branch_undef(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch_undef:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e64 v0, |s4|, |s4|
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB2_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
 ; GFX10-NEXT:  .LBB2_2: ; %exit
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -98,13 +100,13 @@ exit:
 define float @fold_abs_in_branch_poison(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch_poison:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e64 v0, |s4|, |s4|
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB3_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
 ; GFX10-NEXT:  .LBB3_2: ; %exit
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -126,16 +128,17 @@ exit:
 define float @fold_abs_in_branch_fabs(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch_fabs:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX10-NEXT:    v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT:    v_add_f32_e64 v0, |v1|, |v1|
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -158,11 +161,11 @@ exit:
 define float @fold_abs_in_branch_phi(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch_phi:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e64 v0, |v0|, |v0|
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX10-NEXT:  ; %bb.1: ; %header.preheader
@@ -175,6 +178,7 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) {
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB5_2
 ; GFX10-NEXT:  .LBB5_3: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -203,11 +207,11 @@ exit:
 define float @fold_neg_in_branch(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_neg_in_branch:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_rcp_f32_e64 v1, -v0
@@ -215,6 +219,7 @@ define float @fold_neg_in_branch(float %arg1, float %arg2) {
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 0a74efe79bee7be..337e73f9b6750cf 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -11,7 +11,6 @@ declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
 define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
 ; VI-LABEL: fmul_pow2_4xfloat:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
 ; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
@@ -24,11 +23,11 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
 ; VI-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow2_4xfloat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
@@ -41,11 +40,11 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow2_4xfloat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
@@ -59,6 +58,7 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
 ; GFX11-NEXT:    v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
   %p2_f = uitofp <4 x i32> %p2 to <4 x float>
@@ -69,30 +69,30 @@ define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
 define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
 ; VI-LABEL: fmul_pow2_ldexp_4xfloat:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x41100000
 ; VI-NEXT:    v_ldexp_f32 v0, s4, v0
 ; VI-NEXT:    v_ldexp_f32 v1, s4, v1
 ; VI-NEXT:    v_ldexp_f32 v2, s4, v2
 ; VI-NEXT:    v_ldexp_f32 v3, s4, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
 ; GFX10-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
 ; GFX10-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
 ; GFX10-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
 ; GFX11-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
 ; GFX11-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
 ; GFX11-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
   ret <4 x float> %r
@@ -101,7 +101,6 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
 define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
 ; VI-LABEL: fdiv_pow2_4xfloat:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
@@ -110,11 +109,11 @@ define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 0x41100000, v1
 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0x41100000, v2
 ; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0x41100000, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow2_4xfloat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
@@ -123,11 +122,11 @@ define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41100000, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow2_4xfloat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
@@ -138,6 +137,7 @@ define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
   %p2_f = uitofp <4 x i32> %p2 to <4 x float>
@@ -150,7 +150,6 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
 define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; VI-LABEL: fmul_pow2_8xhalf:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, 1
 ; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, 1
 ; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -181,11 +180,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; VI-NEXT:    v_or_b32_e32 v1, v7, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow2_8xhalf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
@@ -206,11 +205,11 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow2_8xhalf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
@@ -240,6 +239,7 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
   %p2_f = uitofp <8 x i16> %p2 to <8 x half>
@@ -250,7 +250,6 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; VI-LABEL: fmul_pow2_ldexp_8xhalf:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x7000
 ; VI-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
 ; VI-NEXT:    v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -264,11 +263,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; VI-NEXT:    v_or_b32_e32 v1, v7, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; VI-NEXT:    v_or_b32_e32 v3, v4, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
 ; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
 ; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
@@ -282,11 +281,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
 ; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
 ; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
@@ -305,6 +304,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
   ret <8 x half> %r
@@ -313,7 +313,6 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
 ; VI-LABEL: fdiv_pow2_8xhalf:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, 10
 ; VI-NEXT:    v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7000
@@ -336,11 +335,11 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v8
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v7
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v5
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow2_8xhalf:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
@@ -349,11 +348,11 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-NEXT:    v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow2_8xhalf:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
@@ -364,6 +363,7 @@ define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
   %p2_f = uitofp <8 x i16> %p2 to <8 x half>
@@ -420,7 +420,6 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -429,22 +428,22 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
 ; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -454,6 +453,7 @@ define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 1, %cnt
   %conv = uitofp i64 %shl to double
@@ -510,7 +510,6 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -519,22 +518,22 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
 ; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
 ; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -544,6 +543,7 @@ define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 2, %cnt
   %conv = uitofp i64 %shl to double
@@ -599,29 +599,28 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_select:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 1, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_select:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_select:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
@@ -630,6 +629,7 @@ define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl2 = shl nuw i32 2, %cnt
   %shl1 = shl nuw i32 1, %cnt
@@ -691,7 +691,6 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_fly_pow_mul_min_pow2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; VI-NEXT:    s_mov_b64 s[4:5], 0x2000
 ; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
@@ -707,11 +706,11 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
 ; VI-NEXT:    v_ldexp_f32 v0, v0, v1
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
@@ -725,11 +724,11 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
@@ -748,6 +747,7 @@ define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl8 = shl nuw i64 8, %cnt
   %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
@@ -816,32 +816,32 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_mul_max_pow2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    s_mov_b32 s5, 0x40080000
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_mul_max_pow2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_mul_max_pow2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl2 = shl nuw i16 2, %cnt
   %shl1 = shl nuw i16 1, %cnt
@@ -896,7 +896,6 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -905,22 +904,22 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
 ; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -930,6 +929,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 %v, %cnt
   %conv = uitofp i64 %shl to double
@@ -1024,7 +1024,6 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v2
 ; VI-NEXT:    v_min_u32_e32 v5, 32, v3
@@ -1045,11 +1044,11 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; VI-NEXT:    v_ldexp_f32 v0, v0, v2
 ; VI-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
@@ -1070,11 +1069,11 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1102,6 +1101,7 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x float>
@@ -1166,7 +1166,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
@@ -1181,11 +1180,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
@@ -1198,11 +1197,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1220,6 +1219,7 @@ define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1288,7 +1288,6 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 2
 ; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
 ; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
@@ -1305,11 +1304,11 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; VI-NEXT:    v_add_f32_e32 v1, v1, v5
 ; VI-NEXT:    v_add_f32_e32 v2, v2, v6
 ; VI-NEXT:    v_add_f32_e32 v3, v3, v7
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
@@ -1326,11 +1325,11 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 2
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 2
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 2
@@ -1347,6 +1346,7 @@ define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5
 ; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
   %conv = uitofp <4 x i32> %shl to <4 x float>
@@ -1412,7 +1412,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v1
@@ -1429,11 +1428,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; VI-NEXT:    s_mov_b32 s4, 0
 ; VI-NEXT:    s_mov_b32 s5, 0x402c0000
 ; VI-NEXT:    v_mul_f64 v[2:3], v[4:5], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
@@ -1446,11 +1445,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1468,6 +1467,7 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwin
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402c0000, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1532,7 +1532,6 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
@@ -1547,11 +1546,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
@@ -1564,11 +1563,11 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1586,6 +1585,7 @@ define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwi
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], 0x402e0000, v[0:1]
 ; GFX11-NEXT:    v_mul_f64 v[2:3], 0x402e0000, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -1748,7 +1748,6 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 2
 ; VI-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
@@ -1758,21 +1757,21 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x4b80, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -1781,6 +1780,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
   %conv = uitofp <2 x i16> %shl to <2 x half>
@@ -1837,7 +1837,6 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
@@ -1846,11 +1845,11 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
 ; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
 ; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
@@ -1859,11 +1858,11 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
 ; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
 ; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
@@ -1875,6 +1874,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i64 1, %cnt
   %conv = uitofp i64 %shl to double
@@ -1926,28 +1926,27 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fmul_pow_shl_cnt_safe:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    s_mov_b32 s4, 0xff5f3992
 ; VI-NEXT:    s_mov_b32 s5, 0x7befffff
 ; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fmul_pow_shl_cnt_safe:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
 ; GFX10-NEXT:    s_mov_b32 s4, 0xff5f3992
 ; GFX10-NEXT:    s_mov_b32 s5, 0x7befffff
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fmul_pow_shl_cnt_safe:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
 ; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
 ; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
@@ -1956,6 +1955,7 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
 ; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to double
@@ -2026,7 +2026,6 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_vec:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
 ; VI-NEXT:    v_sub_u32_e64 v0, vcc, 0, 0
@@ -2034,22 +2033,22 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 20, v2
 ; VI-NEXT:    v_sub_u32_e64 v2, vcc, 0, 0
 ; VI-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
@@ -2057,6 +2056,7 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
@@ -2155,30 +2155,30 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 1.0, v0
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 1.0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 1.0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 23, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 1.0, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 1.0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x float>
@@ -2248,7 +2248,6 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; VI-NEXT:    s_mov_b32 s6, 0xc1100000
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v1
@@ -2270,11 +2269,11 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
 ; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
@@ -2295,11 +2294,11 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
@@ -2329,6 +2328,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 8, %cnt
   %conv = uitofp i64 %shl to float
@@ -2380,7 +2380,6 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; VI-NEXT:    s_mov_b32 s6, 0xc1100000
 ; VI-NEXT:    v_xor_b32_e32 v2, v0, v1
@@ -2406,11 +2405,11 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX10-NEXT:    v_xor_b32_e32 v2, v0, v1
 ; GFX10-NEXT:    v_ffbh_i32_e32 v3, v1
@@ -2435,11 +2434,11 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, v0, v1
@@ -2475,6 +2474,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i64 8, %cnt
   %conv = sitofp i64 %shl to float
@@ -2530,7 +2530,6 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v0, 31, v0
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
@@ -2555,11 +2554,11 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 31, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
@@ -2584,11 +2583,11 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 31, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
@@ -2624,6 +2623,7 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cnt = and i64 %cnt_in, 31
   %shl = shl i64 8, %cnt
@@ -2699,7 +2699,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; VI-NEXT:    s_movk_i32 s4, 0x7000
@@ -2709,11 +2708,11 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x46000000, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX10-NEXT:    s_mov_b32 s4, 0x46000000
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
@@ -2722,11 +2721,11 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX10-NEXT:    v_fma_mixlo_f16 v1, v1, s4, 0
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2739,6 +2738,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX11-NEXT:    v_fma_mixlo_f16 v1, v1, s0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
   %conv = uitofp i32 %shl to half
@@ -2817,24 +2817,24 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
 ; VI-NEXT:    v_sub_u16_e32 v0, 0x7000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 10, v0
 ; GFX10-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
@@ -2913,24 +2913,24 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
 ; VI-NEXT:    v_sub_u16_e32 v0, 0x4800, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 10, v0
 ; GFX10-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
@@ -3009,7 +3009,6 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -3017,11 +3016,11 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; VI-NEXT:    v_add_f32_e32 v1, v1, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
 ; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
@@ -3029,11 +3028,11 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
@@ -3045,6 +3044,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
@@ -3096,31 +3096,31 @@ define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x36a00000
 ; VI-NEXT:    v_sub_u32_e64 v2, vcc, 0, 0
 ; VI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
 ; GFX10-NEXT:    v_sub_co_u32 v1, vcc_lo, 0, 0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 20, v0
 ; GFX11-NEXT:    v_sub_co_u32 v1, vcc_lo, 0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
   %conv = uitofp i32 %shl to double
@@ -3172,7 +3172,6 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; VI-NEXT:    s_mov_b32 s6, 0x10fffff8
@@ -3187,11 +3186,11 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
@@ -3205,11 +3204,11 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX10-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX10-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
@@ -3229,6 +3228,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 0x10fffff8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
   %conv = uitofp i32 %shl to float
@@ -3280,24 +3280,24 @@ define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    retq
 ; VI-LABEL: fdiv_pow_shl_cnt32_okay:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x11000000, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x11000000, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
   %conv = uitofp i32 %shl to float
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 4bd9991912977b3..92bcdf71800aa34 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -61,9 +61,8 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
 define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret float %ret
@@ -72,13 +71,13 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
 ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret float %ret
@@ -101,9 +100,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
 define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
@@ -126,9 +124,8 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
 define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
@@ -151,9 +148,8 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
 define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
@@ -175,9 +171,8 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2f16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    ds_pk_add_rtn_f16 v0, v0, v1
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret <2 x half> %ret
@@ -203,12 +198,12 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
 ; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
 ; GFX940:       ; %bb.0:
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    ds_pk_add_rtn_bf16 v0, v0, v1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index ee5e0e99510577b..12316634fd4d00a 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1325,16 +1325,14 @@ main_body:
 define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1344,7 +1342,6 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB47_1: ; %atomicrmw.start
@@ -1360,6 +1357,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1370,13 +1368,13 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1386,24 +1384,24 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1413,7 +1411,6 @@ main_body:
 define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, double %data) #1 {
 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB49_1: ; %atomicrmw.start
@@ -1429,23 +1426,25 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB49_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1455,16 +1454,14 @@ main_body:
 define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fmax_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1474,16 +1471,14 @@ main_body:
 define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
 ; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: global_atomic_fmin_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1654,7 +1649,6 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB56_1: ; %atomicrmw.start
@@ -1670,6 +1664,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB56_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1680,13 +1675,13 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
@@ -1696,24 +1691,24 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    s_waitcnt expcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1723,7 +1718,6 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB58_1: ; %atomicrmw.start
@@ -1740,6 +1734,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB58_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -1750,14 +1745,13 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    buffer_wbl2 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1794,16 +1788,14 @@ main_body:
 define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
@@ -1882,16 +1874,14 @@ main_body:
 define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fmin_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
@@ -1928,16 +1918,14 @@ main_body:
 define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) {
 ; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: flat_atomic_fmax_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
-; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
@@ -1972,20 +1960,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2020,20 +2006,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
@@ -2150,21 +2134,19 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data) #1 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2174,20 +2156,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
@@ -2197,20 +2177,18 @@ main_body:
 define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
 ; GFX90A:       ; %bb.0: ; %main_body
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
 ; GFX940:       ; %bb.0: ; %main_body
-; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX940-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
-; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 main_body:
   %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
index 728b92b3f73ff24..babafe1866b6ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
@@ -10,25 +10,25 @@
 define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
 ; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -41,22 +41,22 @@ entry:
 define double @fadd_fpext_fmul_f16_to_f64(half %x, half %y, double %z) #0 {
 ; GFX11-LABEL: fadd_fpext_fmul_f16_to_f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fadd_fpext_fmul_f16_to_f64:
 ; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX89-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -69,19 +69,19 @@ entry:
 define double @fadd_fpext_fmul_f32_to_f64(float %x, float %y, double %z) #0 {
 ; GFX11-LABEL: fadd_fpext_fmul_f32_to_f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fadd_fpext_fmul_f32_to_f64:
 ; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX89-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul float %x, %y
@@ -94,25 +94,25 @@ entry:
 define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 {
 ; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -126,28 +126,28 @@ entry:
 define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
 ; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %u, %v
@@ -162,28 +162,28 @@ entry:
 define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
 ; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %u, %v
@@ -196,28 +196,28 @@ entry:
 define float @fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
 ; GFX11-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %u, %v
@@ -233,28 +233,28 @@ entry:
 define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
 ; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
@@ -267,28 +267,28 @@ entry:
 define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
 ; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v2, v3, v4 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v2, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
@@ -303,29 +303,29 @@ entry:
 define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fadd_fpext_fmuladd_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmuladd_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fpext_fmuladd_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
@@ -338,29 +338,29 @@ entry:
 define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fadd_fpext_fma_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
@@ -373,29 +373,29 @@ entry:
 define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fadd_fpext_fma_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fadd_fpext_fma_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX9-F32DENORM-NEXT:    v_fma_f16 v1, v1, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-F32DENORM-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %u, %v
@@ -410,25 +410,25 @@ entry:
 define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
 ; GFX11-LABEL: fsub_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -442,31 +442,31 @@ entry:
 define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 {
 ; GFX11-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
 ; GFX11-F32FLUSH:       ; %bb.0: ; %entry
-; GFX11-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
+; GFX11-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
 ; GFX11-F32DENORM:       ; %bb.0: ; %entry
-; GFX11-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-F32DENORM-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul contract half %y, %z
@@ -480,25 +480,25 @@ entry:
 define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
 ; GFX11-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -513,25 +513,25 @@ entry:
 define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
 ; GFX11-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, v0, -v1, -v2 op_sel_hi:[1,1,0]
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e64 v0, v0, -v1
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %x, %y
@@ -546,28 +546,28 @@ entry:
 define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v2, v3, v4, -v2 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GFX9-F32FLUSH-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v0, v0, v1, v3
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul reassoc half %u, %v
@@ -583,22 +583,22 @@ entry:
 define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f16_e32 v3, v0, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
 ; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX89-NEXT:    v_fma_f16 v0, v0, v1, v3
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX89-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %u, %v
@@ -613,27 +613,27 @@ entry:
 define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
-; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    v_mad_mix_f32 v0, -v3, v4, v0 op_sel_hi:[1,1,0]
 ; GFX9-F32FLUSH-NEXT:    v_mad_f32 v0, -v1, v2, v0
+; GFX9-F32FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32DENORM-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
 ; GFX9-F32DENORM:       ; %bb.0: ; %entry
-; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX9-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX9-F32DENORM-NEXT:    v_fma_f32 v1, v1, v2, v3
 ; GFX9-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX9-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul reassoc half %u, %v
@@ -649,22 +649,22 @@ entry:
 define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
 ; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
 ; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mul_f16_e32 v3, v3, v4
 ; GFX89-NEXT:    v_fma_f16 v1, v1, v2, v3
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX89-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = fmul half %u, %v
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index fc1e079a38a80cb..0c1807d48d12080 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -9,52 +9,52 @@
 define float @v_pow_f32(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_f32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call float @llvm.pow.f32(float %x, float %y)
   ret float %pow
@@ -63,62 +63,61 @@ define float @v_pow_f32(float %x, float %y) {
 define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6-LABEL: v_pow_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_log_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v2, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v1, v3, v1
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v1, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_log_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_log_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
@@ -126,6 +125,7 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    v_exp_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y)
   ret <2 x float> %pow
@@ -134,7 +134,6 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 define half @v_pow_f16(half %x, half %y) {
 ; GFX6-LABEL: v_pow_f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -142,55 +141,55 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_f16:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -200,6 +199,7 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call half @llvm.pow.f16(half %x, half %y)
   ret half %pow
@@ -208,7 +208,6 @@ define half @v_pow_f16(half %x, half %y) {
 define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -223,11 +222,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -241,11 +240,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -259,11 +258,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -277,11 +276,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -295,11 +294,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -321,6 +320,7 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
   ret <2 x half> %pow
@@ -329,7 +329,6 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -348,11 +347,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -366,11 +365,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -384,11 +383,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -402,11 +401,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -420,11 +419,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -446,6 +445,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@@ -455,7 +455,6 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -474,11 +473,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -492,11 +491,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -510,11 +509,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -528,11 +527,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -546,11 +545,11 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -572,6 +571,7 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@@ -581,7 +581,6 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -604,11 +603,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX8-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -622,11 +621,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -640,11 +639,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -658,11 +657,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -676,11 +675,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
@@ -702,6 +701,7 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y
@@ -718,52 +718,52 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_f32_fabs_lhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %pow = call float @llvm.pow.f32(float %fabs.x, float %y)
@@ -773,52 +773,52 @@ define float @v_pow_f32_fabs_lhs(float %x, float %y) {
 define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_f32_fabs_rhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, |v1|, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, |v1|, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y)
   %pow = call float @llvm.pow.f32(float %x, float %fabs.y)
@@ -828,52 +828,52 @@ define float @v_pow_f32_fabs_rhs(float %x, float %y) {
 define float @v_pow_f32_fabs_lhs_rhs(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX6-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX8-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX9-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, |v1|, v0
 ; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX10-NEXT:    v_mul_legacy_f32_e64 v0, |v1|, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v0, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e64 v0, |v0|
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e64 v0, |v1|, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %fabs.y = call float @llvm.fabs.f32(float %y)
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 3a0b8259d084965..8cb3d6198cf8e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -48,7 +48,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
 ;
 ; GFX6-LABEL: safe_math_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; GFX6-NEXT:    v_min_f32_e32 v4, 0x3f7fffff, v4
@@ -62,12 +61,11 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_fract_f32_e32 v4, v0
@@ -78,30 +76,29 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) nocapture writeonly
 ; GFX7-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX8-NEXT:    v_fract_f32_e32 v4, v0
 ; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s4
 ; GFX8-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v3, v0
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX11-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -138,7 +135,6 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
 ;
 ; GFX6-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -149,12 +145,11 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s4, s6
@@ -162,24 +157,23 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -207,7 +201,6 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ;
 ; GFX6-LABEL: no_nan_check_math_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
@@ -219,12 +212,11 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: no_nan_check_math_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
@@ -236,12 +228,11 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: no_nan_check_math_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -249,12 +240,11 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: no_nan_check_math_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -262,6 +252,7 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) nocapture w
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    v_min_f32_e32 v4, 0x3f7fffff, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -291,28 +282,28 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_f32_nonans:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f32_nonans:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f32_nonans:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f32_nonans:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -332,35 +323,35 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ;
 ; GFX6-LABEL: basic_fract_f32_flags_minnum:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f32_flags_minnum:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f32_flags_minnum:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f32_flags_minnum:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -386,28 +377,28 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_f32_flags_fsub:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f32_flags_fsub:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f32_flags_fsub:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f32_flags_fsub:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -438,34 +429,34 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_v2f32_nonans:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v2, v0
 ; GFX6-NEXT:    v_floor_f32_e32 v3, v1
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_v2f32_nonans:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    v_fract_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_v2f32_nonans:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    v_fract_f32_e32 v1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_v2f32_nonans:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    v_fract_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
@@ -495,7 +486,6 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ;
 ; GFX6-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    v_sub_f32_e32 v3, v0, v3
@@ -504,12 +494,11 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v3
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
@@ -518,27 +507,26 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX7-NEXT:    v_sub_f32_e32 v3, v0, v3
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v3, v0, v3
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v3, v0, v3
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -565,28 +553,28 @@ define float @nnan_minnum_fract_f32(float %x) {
 ;
 ; GFX6-LABEL: nnan_minnum_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: nnan_minnum_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: nnan_minnum_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: nnan_minnum_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -608,35 +596,35 @@ define float @nnan_fsub_fract_f32(float %x) {
 ;
 ; GFX6-LABEL: nnan_fsub_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: nnan_fsub_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: nnan_fsub_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: nnan_fsub_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -656,35 +644,35 @@ define float @nnan_floor_fract_f32(float %x) {
 ;
 ; GFX6-LABEL: nnan_floor_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: nnan_floor_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: nnan_floor_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: nnan_floor_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call nnan float @llvm.floor.f32(float %x)
@@ -710,28 +698,28 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: nnan_src_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: nnan_src_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: nnan_src_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: nnan_src_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -752,35 +740,35 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: not_fract_f32_wrong_const:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: not_fract_f32_wrong_const:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: not_fract_f32_wrong_const:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: not_fract_f32_wrong_const:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -801,35 +789,35 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: not_fract_f32_swapped_fsub:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: not_fract_f32_swapped_fsub:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: not_fract_f32_swapped_fsub:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: not_fract_f32_swapped_fsub:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -850,35 +838,35 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: not_fract_f32_not_floor:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: not_fract_f32_not_floor:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: not_fract_f32_not_floor:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: not_fract_f32_not_floor:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_trunc_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.trunc.f32(float %x)
@@ -899,35 +887,35 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ;
 ; GFX6-LABEL: not_fract_f32_different_floor:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: not_fract_f32_different_floor:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: not_fract_f32_different_floor:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: not_fract_f32_different_floor:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %y)
@@ -948,35 +936,35 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: not_fract_f32_maxnum:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: not_fract_f32_maxnum:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: not_fract_f32_maxnum:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: not_fract_f32_maxnum:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1029,30 +1017,30 @@ define float @select_nan_fract_f32(float %x) {
 ;
 ; GFX6-LABEL: select_nan_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: select_nan_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: select_nan_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_nan_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1082,30 +1070,30 @@ define float @commuted_select_nan_fract_f32(float %x) {
 ;
 ; GFX6-LABEL: commuted_select_nan_fract_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: commuted_select_nan_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: commuted_select_nan_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: commuted_select_nan_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1129,37 +1117,36 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ;
 ; GFX6-LABEL: wrong_commuted_nan_select_f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: wrong_commuted_nan_select_f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: wrong_commuted_nan_select_f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: wrong_commuted_nan_select_f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1167,6 +1154,7 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX11-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1202,34 +1190,34 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_f16_nonan:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f16_nonan:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f16_nonan:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
@@ -1268,7 +1256,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_v2f16_nonan:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1279,11 +1266,11 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fe000, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_v2f16_nonan:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1294,24 +1281,25 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX7-NEXT:    v_min_f32_e32 v1, 0x3f7fe000, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_v2f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f16_e32 v1, v0
 ; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_v2f16_nonan:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fract_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
@@ -1337,7 +1325,6 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
 ;
 ; GFX6-LABEL: basic_fract_f64_nanans:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[2:3], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v4, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v5, 0x3fefffff
@@ -1350,24 +1337,25 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
 ; GFX6-NEXT:    s_mov_b32 s5, 0x3fefffff
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: basic_fract_f64_nanans:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: basic_fract_f64_nanans:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: basic_fract_f64_nanans:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
@@ -1409,7 +1397,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
 ;
 ; GFX6-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
@@ -1423,12 +1410,11 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
@@ -1442,24 +1428,23 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) nocapture
 ; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
 ; GFX7-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f16_e32 v3, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX8-NEXT:    global_store_short v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f16_e32 v3, v0
 ; GFX11-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
@@ -1493,7 +1478,6 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
 ;
 ; GFX6-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0x3fefffff
@@ -1514,12 +1498,11 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_floor_f64_e32 v[4:5], v[0:1]
 ; GFX7-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -1527,24 +1510,23 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) nocap
 ; GFX7-NEXT:    s_mov_b32 s4, s6
 ; GFX7-NEXT:    s_mov_b32 s5, s6
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_floor_f64_e32 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_floor_f64_e32 v[4:5], v[0:1]
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
@@ -1575,30 +1557,30 @@ define float @select_nan_fract_f32_flags_select(float %x) {
 ;
 ; GFX6-LABEL: select_nan_fract_f32_flags_select:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: select_nan_fract_f32_flags_select:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: select_nan_fract_f32_flags_select:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_nan_fract_f32_flags_select:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1628,30 +1610,30 @@ define float @select_nan_fract_f32_flags_minnum(float %x) {
 ;
 ; GFX6-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v1, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call float @llvm.floor.f32(float %x)
@@ -1695,7 +1677,6 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ;
 ; GFX6-LABEL: safe_math_fract_v2f32:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_floor_f32_e32 v5, v1
 ; GFX6-NEXT:    v_floor_f32_e32 v4, v0
 ; GFX6-NEXT:    v_sub_f32_e32 v6, v1, v5
@@ -1716,12 +1697,11 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX6-NEXT:    s_mov_b32 s5, s6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, s[8:9]
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
 ; GFX7-NEXT:    v_fract_f32_e32 v6, v0
 ; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
@@ -1736,12 +1716,11 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX7-NEXT:    v_floor_f32_e32 v5, v1
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX8-NEXT:    v_fract_f32_e32 v6, v0
 ; GFX8-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s4
@@ -1752,12 +1731,11 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX8-NEXT:    v_floor_f32_e32 v5, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v6, v0
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX11-NEXT:    v_fract_f32_e32 v7, v1
@@ -1767,6 +1745,7 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
 ; GFX11-NEXT:    v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1|
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
@@ -1809,7 +1788,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ;
 ; GFX6-LABEL: safe_math_fract_f64:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0x3fefffff
@@ -1835,12 +1813,11 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v7, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f64:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s4, 0
 ; GFX7-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX7-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
@@ -1853,12 +1830,11 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GFX7-NEXT:    buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f64:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0
 ; GFX8-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX8-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
@@ -1867,18 +1843,18 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) nocapture writeon
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[6:7], off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[4:5], v[0:1]
 ; GFX11-NEXT:    v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]|
 ; GFX11-NEXT:    v_floor_f64_e32 v[6:7], v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[6:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call double @llvm.floor.f64(double %x)
@@ -1935,7 +1911,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ;
 ; GFX6-LABEL: safe_math_fract_f16:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
 ; GFX6-NEXT:    s_mov_b32 s6, 0
@@ -1952,12 +1927,11 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX6-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_f16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
 ; GFX7-NEXT:    s_mov_b32 s6, 0
@@ -1974,30 +1948,29 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX7-NEXT:    v_cmp_neq_f32_e64 vcc, |v0|, s8
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX7-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_f16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX8-NEXT:    v_fract_f16_e32 v4, v0
 ; GFX8-NEXT:    v_cmp_neq_f16_e64 vcc, |v0|, s4
 ; GFX8-NEXT:    v_floor_f16_e32 v3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX8-NEXT:    global_store_short v[1:2], v3, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_f16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f16_e32 v3, v0
 ; GFX11-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
 ; GFX11-NEXT:    v_floor_f16_e32 v4, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b16 v[1:2], v4, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
@@ -2059,7 +2032,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ;
 ; GFX6-LABEL: safe_math_fract_v2f16:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    s_mov_b32 s8, 0x7f800000
@@ -2090,12 +2062,11 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX6-NEXT:    v_or_b32_e32 v7, v9, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    buffer_store_dword v7, v[2:3], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_v2f16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    s_mov_b32 s8, 0x7f800000
@@ -2126,12 +2097,11 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX7-NEXT:    v_or_b32_e32 v7, v9, v7
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX7-NEXT:    buffer_store_dword v7, v[2:3], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_v2f16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s6, 0x204
 ; GFX8-NEXT:    v_floor_f16_e32 v4, v3
@@ -2145,12 +2115,11 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, 0, s[4:5]
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX8-NEXT:    global_store_dword v[1:2], v4, off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_v2f16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_fract_f16_e32 v6, v0
 ; GFX11-NEXT:    v_floor_f16_e32 v5, v0
@@ -2165,6 +2134,7 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
@@ -2212,7 +2182,6 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ;
 ; GFX6-LABEL: safe_math_fract_v2f64:
 ; GFX6:       ; %bb.0: ; %entry
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_fract_f64_e32 v[6:7], v[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v10, -1
 ; GFX6-NEXT:    v_mov_b32_e32 v11, 0x3fefffff
@@ -2251,12 +2220,11 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, s[8:9]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, s[8:9]
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: safe_math_fract_v2f64:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0x204
 ; GFX7-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
 ; GFX7-NEXT:    v_cmp_class_f64_e64 s[8:9], v[0:1], s4
@@ -2273,12 +2241,11 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX7-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[10:11]
 ; GFX7-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[10:11]
 ; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: safe_math_fract_v2f64:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s6, 0x204
 ; GFX8-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
 ; GFX8-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], s6
@@ -2291,12 +2258,11 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s[6:7]
 ; GFX8-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: safe_math_fract_v2f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[10:11], v[0:1]
 ; GFX11-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x204
 ; GFX11-NEXT:    v_fract_f64_e32 v[12:13], v[2:3]
@@ -2308,6 +2274,7 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v12, 0, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %x)
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 29daff566922c6c..9c956305f4b79a3 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -9,9 +9,9 @@ declare hidden void @external_void_func_void() #0
 define void @callee_with_stack_and_call() #0 {
 ; SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
 ; SPILL-TO-VGPR:       ; %bb.0:
-; SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s4, s33
 ; SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s32
+; SPILL-TO-VGPR-NEXT:    s_waitcnt expcnt(0)
 ; SPILL-TO-VGPR-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; SPILL-TO-VGPR-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[8:9]
@@ -39,13 +39,13 @@ define void @callee_with_stack_and_call() #0 {
 ;
 ; NO-SPILL-TO-VGPR-LABEL: callee_with_stack_and_call:
 ; NO-SPILL-TO-VGPR:       ; %bb.0:
-; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b32 s4, s33
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s32
 ; NO-SPILL-TO-VGPR-NEXT:    v_mov_b32_e32 v0, s4
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; NO-SPILL-TO-VGPR-NEXT:    s_addk_i32 s32, 0x800
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
+; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt expcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s30, 0
diff --git a/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll b/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
index b27bf69d6627504..818ffeaf0fda0a3 100644
--- a/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
+++ b/llvm/test/CodeGen/AMDGPU/frexp-constant-fold.ll
@@ -9,10 +9,10 @@ declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>)
 define { float, i32 } @frexp_frexp(float %x) {
 ; CHECK-LABEL: frexp_frexp:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
   %frexp0.0 = extractvalue { float, i32 } %frexp0, 0
@@ -23,13 +23,13 @@ define { float, i32 } @frexp_frexp(float %x) {
 define { <2 x float>, <2 x i32> } @frexp_frexp_vector(<2 x float> %x) {
 ; CHECK-LABEL: frexp_frexp_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v4, v0
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %frexp0 = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %x)
   %frexp0.0 = extractvalue { <2 x float>, <2 x i32> } %frexp0, 0
@@ -40,9 +40,9 @@ define { <2 x float>, <2 x i32> } @frexp_frexp_vector(<2 x float> %x) {
 define { float, i32 } @frexp_frexp_const(float %x) {
 ; CHECK-LABEL: frexp_frexp_const:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3f280000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float 42.0)
   %frexp0.0 = extractvalue { float, i32 } %frexp0, 0
@@ -71,9 +71,9 @@ define { <2 x float>, <2 x i32> } @frexp_poison_vector() {
 define { float, i32 } @frexp_undef() {
 ; CHECK-LABEL: frexp_undef:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v0, s4
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v1, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float undef)
   ret { float, i32 } %ret
@@ -81,11 +81,11 @@ define { float, i32 } @frexp_undef() {
 define { <2 x float>, <2 x i32> } @frexp_undef_vector() {
 ; CHECK-LABEL: frexp_undef_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v0, s4
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v2, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> undef)
   ret { <2 x float>, <2 x i32> } %ret
@@ -94,11 +94,11 @@ define { <2 x float>, <2 x i32> } @frexp_undef_vector() {
 define { <2 x float>, <2 x i32> } @frexp_zero_vector() {
 ; CHECK-LABEL: frexp_zero_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> zeroinitializer)
   ret { <2 x float>, <2 x i32> } %ret
@@ -107,11 +107,11 @@ define { <2 x float>, <2 x i32> } @frexp_zero_vector() {
 define { <2 x float>, <2 x i32> } @frexp_zero_negzero_vector() {
 ; CHECK-LABEL: frexp_zero_negzero_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfrev_b32_e32 v1, 1
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0.0, float -0.0>)
   ret { <2 x float>, <2 x i32> } %ret
@@ -120,7 +120,6 @@ define { <2 x float>, <2 x i32> } @frexp_zero_negzero_vector() {
 define { <4 x float>, <4 x i32> } @frexp_nonsplat_vector() {
 ; CHECK-LABEL: frexp_nonsplat_vector:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v2, s4
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v6, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0.5
@@ -129,6 +128,7 @@ define { <4 x float>, <4 x i32> } @frexp_nonsplat_vector() {
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 5
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 6
 ; CHECK-NEXT:    v_mov_b32_e32 v7, 14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> <float 16.0, float -32.0, float undef, float 9999.0>)
   ret { <4 x float>, <4 x i32> } %ret
@@ -137,9 +137,9 @@ define { <4 x float>, <4 x i32> } @frexp_nonsplat_vector() {
 define { float, i32 } @frexp_zero() {
 ; CHECK-LABEL: frexp_zero:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 0.0)
   ret { float, i32 } %ret
@@ -148,9 +148,9 @@ define { float, i32 } @frexp_zero() {
 define { float, i32 } @frexp_negzero() {
 ; CHECK-LABEL: frexp_negzero:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfrev_b32_e32 v0, 1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float -0.0)
   ret { float, i32 } %ret
@@ -159,9 +159,9 @@ define { float, i32 } @frexp_negzero() {
 define { float, i32 } @frexp_one() {
 ; CHECK-LABEL: frexp_one:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 1.0)
   ret { float, i32 } %ret
@@ -170,9 +170,9 @@ define { float, i32 } @frexp_one() {
 define { float, i32 } @frexp_negone() {
 ; CHECK-LABEL: frexp_negone:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, -0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float -1.0)
   ret { float, i32 } %ret
@@ -181,9 +181,9 @@ define { float, i32 } @frexp_negone() {
 define { float, i32 } @frexp_two() {
 ; CHECK-LABEL: frexp_two:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 2.0)
   ret { float, i32 } %ret
@@ -192,9 +192,9 @@ define { float, i32 } @frexp_two() {
 define { float, i32 } @frexp_negtwo() {
 ; CHECK-LABEL: frexp_negtwo:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, -0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float -2.0)
   ret { float, i32 } %ret
@@ -203,9 +203,9 @@ define { float, i32 } @frexp_negtwo() {
 define { float, i32 } @frexp_inf() {
 ; CHECK-LABEL: frexp_inf:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 0x7FF0000000000000)
   ret { float, i32 } %ret
@@ -214,9 +214,9 @@ define { float, i32 } @frexp_inf() {
 define { float, i32 } @frexp_neginf() {
 ; CHECK-LABEL: frexp_neginf:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0xff800000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 0xFFF0000000000000)
   ret { float, i32 } %ret
@@ -225,9 +225,9 @@ define { float, i32 } @frexp_neginf() {
 define { float, i32 } @frexp_qnan() {
 ; CHECK-LABEL: frexp_qnan:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float 0x7FF8000000000000)
   ret { float, i32 } %ret
@@ -236,9 +236,9 @@ define { float, i32 } @frexp_qnan() {
 define { float, i32 } @frexp_snan() {
 ; CHECK-LABEL: frexp_snan:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x7fc00001
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 2139095041 to float))
   ret { float, i32 } %ret
@@ -247,9 +247,9 @@ define { float, i32 } @frexp_snan() {
 define { float, i32 } @frexp_pos_denorm() {
 ; CHECK-LABEL: frexp_pos_denorm:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x3f7ffffe
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xffffff82
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 8388607 to float))
   ret { float, i32 } %ret
@@ -258,9 +258,9 @@ define { float, i32 } @frexp_pos_denorm() {
 define { float, i32 } @frexp_neg_denorm() {
 ; CHECK-LABEL: frexp_neg_denorm:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0xbf7ffffe
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xffffff82
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 -2139095041 to float))
   ret { float, i32 } %ret
@@ -269,11 +269,11 @@ define { float, i32 } @frexp_neg_denorm() {
 define { <2 x float>, <2 x i32> } @frexp_splat_4() {
 ; CHECK-LABEL: frexp_splat_4:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0.5
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 3
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 4.0, float 4.0>)
   ret { <2 x float>, <2 x i32> } %ret
@@ -282,11 +282,11 @@ define { <2 x float>, <2 x i32> } @frexp_splat_4() {
 define { <2 x float>, <2 x i32> } @frexp_splat_qnan() {
 ; CHECK-LABEL: frexp_splat_qnan:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>)
   ret { <2 x float>, <2 x i32> } %ret
@@ -295,11 +295,11 @@ define { <2 x float>, <2 x i32> } @frexp_splat_qnan() {
 define { <2 x float>, <2 x i32> } @frexp_splat_inf() {
 ; CHECK-LABEL: frexp_splat_inf:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x7f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000>)
   ret { <2 x float>, <2 x i32> } %ret
@@ -308,11 +308,11 @@ define { <2 x float>, <2 x i32> } @frexp_splat_inf() {
 define { <2 x float>, <2 x i32> } @frexp_splat_neginf() {
 ; CHECK-LABEL: frexp_splat_neginf:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0xff800000
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0xff800000
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000>)
   ret { <2 x float>, <2 x i32> } %ret
@@ -321,11 +321,11 @@ define { <2 x float>, <2 x i32> } @frexp_splat_neginf() {
 define { <2 x float>, <2 x i32> } @frexp_splat_undef_inf() {
 ; CHECK-LABEL: frexp_splat_undef_inf:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_frexp_mant_f32_e32 v0, s4
 ; CHECK-NEXT:    v_frexp_exp_i32_f32_e32 v2, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float undef, float 0x7FF0000000000000>)
   ret { <2 x float>, <2 x i32> } %ret
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 72ca55f3ef3095b..42bfc9b83ee1fe0 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -603,8 +603,8 @@ entry:
 define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 ; GFX89-LABEL: v_fshr_i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_i32:
@@ -614,14 +614,14 @@ define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 ;
 ; GFX10-LABEL: v_fshr_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
   ret i32 %ret
@@ -630,9 +630,9 @@ define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
 define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
 ; GFX89-LABEL: v_fshr_v2i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i32:
@@ -642,16 +642,16 @@ define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2
 ;
 ; GFX10-LABEL: v_fshr_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
   ret <2 x i32> %ret
@@ -660,10 +660,10 @@ define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2
 define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
 ; GFX89-LABEL: v_fshr_v3i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v3i32:
@@ -673,18 +673,18 @@ define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2
 ;
 ; GFX10-LABEL: v_fshr_v3i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v3, v6
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v4, v7
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v5, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
   ret <3 x i32> %ret
@@ -693,11 +693,11 @@ define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2
 define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
 ; GFX89-LABEL: v_fshr_v4i32:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
 ; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v4i32:
@@ -707,20 +707,20 @@ define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2
 ;
 ; GFX10-LABEL: v_fshr_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v4, v8
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v5, v9
 ; GFX11-NEXT:    v_alignbit_b32 v2, v2, v6, v10
 ; GFX11-NEXT:    v_alignbit_b32 v3, v3, v7, v11
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
   ret <4 x i32> %ret
@@ -729,30 +729,30 @@ define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2
 define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ; SI-LABEL: v_fshr_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_i16:
@@ -762,23 +762,23 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 ;
 ; GFX10-LABEL: v_fshr_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
   ret i16 %ret
@@ -787,7 +787,6 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
 define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
 ; SI-LABEL: v_fshr_v2i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_or_b32_e32 v4, 16, v4
@@ -798,11 +797,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 1
 ; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -816,11 +815,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
@@ -828,6 +827,7 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i16:
@@ -837,7 +837,6 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ;
 ; GFX10-LABEL: v_fshr_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -845,11 +844,11 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
@@ -859,6 +858,7 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
   ret <2 x i16> %ret
@@ -867,7 +867,6 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
 define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
 ; SI-LABEL: v_fshr_v3i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
@@ -882,11 +881,11 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v3i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; VI-NEXT:    v_mov_b32_e32 v8, 1
 ; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -905,11 +904,11 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 1
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -929,6 +928,7 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v3i16:
@@ -938,7 +938,6 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ;
 ; GFX10-LABEL: v_fshr_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
@@ -958,11 +957,11 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX10-NEXT:    v_lshlrev_b16 v1, v4, v1
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
@@ -985,6 +984,7 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 ; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
   ret <3 x i16> %ret
@@ -993,7 +993,6 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
 define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
 ; SI-LABEL: v_fshr_v4i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_or_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
@@ -1014,11 +1013,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; VI-NEXT:    v_mov_b32_e32 v8, 1
 ; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1044,11 +1043,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 1
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1075,6 +1074,7 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v4i16:
@@ -1084,7 +1084,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ;
 ; GFX10-LABEL: v_fshr_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -1113,11 +1112,11 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -1148,6 +1147,7 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
   ret <4 x i16> %ret
@@ -1156,35 +1156,35 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
 define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; SI-LABEL: v_fshr_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
 ; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; VI-NEXT:    v_not_b32_e32 v4, v4
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX9-NEXT:    v_not_b32_e32 v4, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_i64:
@@ -1194,18 +1194,17 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ;
 ; GFX10-LABEL: v_fshr_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_not_b32_e32 v5, v4
 ; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX11-NEXT:    v_not_b32_e32 v5, v4
 ; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
@@ -1214,6 +1213,7 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
   ret i64 %ret
@@ -1222,7 +1222,6 @@ define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
 define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
 ; SI-LABEL: v_fshr_v2i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
 ; SI-NEXT:    v_not_b32_e32 v8, v8
@@ -1235,11 +1234,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v4
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; VI-NEXT:    v_not_b32_e32 v8, v8
@@ -1252,11 +1251,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; VI-NEXT:    v_or_b32_e32 v2, v2, v5
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
 ; GFX9-NEXT:    v_not_b32_e32 v8, v8
@@ -1269,6 +1268,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i64:
@@ -1278,7 +1278,6 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ;
 ; GFX10-LABEL: v_fshr_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX10-NEXT:    v_not_b32_e32 v9, v8
@@ -1291,11 +1290,11 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
 ; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
 ; GFX11-NEXT:    v_not_b32_e32 v9, v8
@@ -1311,6 +1310,7 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
   ret <2 x i64> %ret
@@ -1319,7 +1319,6 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2
 define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; SI-LABEL: v_fshr_i24:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; SI-NEXT:    v_mul_hi_u32 v3, v3, s4
@@ -1328,11 +1327,11 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_i24:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; VI-NEXT:    v_mul_hi_u32 v3, v3, s4
@@ -1341,11 +1340,11 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
 ; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v3, s4
@@ -1354,6 +1353,7 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_i24:
@@ -1363,7 +1363,6 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ;
 ; GFX10-LABEL: v_fshr_i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaab, v3
@@ -1371,11 +1370,11 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1386,6 +1385,7 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 8, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
   ret i24 %ret
@@ -1394,7 +1394,6 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
 ; SI-LABEL: v_fshr_v2i24:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
 ; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
@@ -1410,11 +1409,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fshr_v2i24:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
 ; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
@@ -1430,11 +1429,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
 ; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_fshr_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
 ; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
@@ -1450,6 +1449,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
 ; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fshr_v2i24:
@@ -1459,7 +1459,6 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ;
 ; GFX10-LABEL: v_fshr_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
@@ -1474,11 +1473,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
@@ -1498,6 +1497,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
   ret <2 x i24> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 046f26246969589..cc892102f429773 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -8,7 +8,6 @@
 define float @v_sqrt_f32(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -27,11 +26,11 @@ define float @v_sqrt_f32(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -50,11 +49,11 @@ define float @v_sqrt_f32(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -72,11 +71,11 @@ define float @v_sqrt_f32(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -94,6 +93,7 @@ define float @v_sqrt_f32(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -102,7 +102,6 @@ define float @v_sqrt_f32(float %x) {
 define float @v_sqrt_f32_fneg(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_fneg:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -121,11 +120,11 @@ define float @v_sqrt_f32_fneg(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_fneg:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e64 v2, -v0, v2
@@ -145,11 +144,11 @@ define float @v_sqrt_f32_fneg(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fneg:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -167,11 +166,11 @@ define float @v_sqrt_f32_fneg(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fneg:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e64 v2, -v0, v2
@@ -190,6 +189,7 @@ define float @v_sqrt_f32_fneg(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.neg = fneg float %x
   %result = call float @llvm.sqrt.f32(float %x.neg)
@@ -199,7 +199,6 @@ define float @v_sqrt_f32_fneg(float %x) {
 define float @v_sqrt_f32_fabs(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_fabs:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    s_mov_b32 s5, 0x4f800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e64 v1, |v0|, s5
@@ -219,11 +218,11 @@ define float @v_sqrt_f32_fabs(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_fabs:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e64 v2, |v0|, v2
@@ -243,11 +242,11 @@ define float @v_sqrt_f32_fabs(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fabs:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    s_mov_b32 s5, 0x4f800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e64 v1, |v0|, s5
@@ -266,11 +265,11 @@ define float @v_sqrt_f32_fabs(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fabs:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e64 v2, |v0|, v2
@@ -289,6 +288,7 @@ define float @v_sqrt_f32_fabs(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %result = call float @llvm.sqrt.f32(float %x.fabs)
@@ -298,7 +298,6 @@ define float @v_sqrt_f32_fabs(float %x) {
 define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_fneg_fabs:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SDAG-IEEE-NEXT:    s_mov_b32 s5, 0xcf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e64 v1, |v0|, s5
@@ -318,11 +317,11 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_fneg_fabs:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e64 v2, -|v0|, v2
@@ -342,11 +341,11 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SDAG-DAZ-NEXT:    s_mov_b32 s5, 0xcf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e64 v1, |v0|, s5
@@ -365,11 +364,11 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_fneg_fabs:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e64 v2, -|v0|, v2
@@ -388,6 +387,7 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %x.fabs.neg = fneg float %x.fabs
@@ -398,7 +398,6 @@ define float @v_sqrt_f32_fneg_fabs(float %x) {
 define float @v_sqrt_f32_ninf(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ninf:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -417,11 +416,11 @@ define float @v_sqrt_f32_ninf(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ninf:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -440,11 +439,11 @@ define float @v_sqrt_f32_ninf(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ninf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -462,11 +461,11 @@ define float @v_sqrt_f32_ninf(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ninf:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -484,6 +483,7 @@ define float @v_sqrt_f32_ninf(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -492,7 +492,6 @@ define float @v_sqrt_f32_ninf(float %x) {
 define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_no_infs_attribute:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -511,11 +510,11 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_no_infs_attribute:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -534,11 +533,11 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -556,11 +555,11 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_no_infs_attribute:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -578,6 +577,7 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -586,7 +586,6 @@ define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
 define float @v_sqrt_f32_nnan(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nnan:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -605,11 +604,11 @@ define float @v_sqrt_f32_nnan(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nnan:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -628,11 +627,11 @@ define float @v_sqrt_f32_nnan(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nnan:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -650,11 +649,11 @@ define float @v_sqrt_f32_nnan(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nnan:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -672,6 +671,7 @@ define float @v_sqrt_f32_nnan(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -912,7 +912,6 @@ define amdgpu_ps i32 @s_sqrt_f32_afn_nnan_ninf(float inreg %x) {
 define float @v_sqrt_f32_nsz(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nsz:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -931,11 +930,11 @@ define float @v_sqrt_f32_nsz(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nsz:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -954,11 +953,11 @@ define float @v_sqrt_f32_nsz(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nsz:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -976,11 +975,11 @@ define float @v_sqrt_f32_nsz(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nsz:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -998,6 +997,7 @@ define float @v_sqrt_f32_nsz(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1006,7 +1006,6 @@ define float @v_sqrt_f32_nsz(float %x) {
 define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nnan_ninf:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1025,11 +1024,11 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nnan_ninf:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1048,11 +1047,11 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1070,11 +1069,11 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nnan_ninf:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1092,6 +1091,7 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1100,7 +1100,6 @@ define float @v_sqrt_f32_nnan_ninf(float %x) {
 define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nnan_ninf_nsz:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1119,11 +1118,11 @@ define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nnan_ninf_nsz:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1142,11 +1141,11 @@ define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_nnan_ninf_nsz:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1164,11 +1163,11 @@ define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_nnan_ninf_nsz:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1186,6 +1185,7 @@ define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1194,8 +1194,8 @@ define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
 define float @v_sqrt_f32_afn(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1204,8 +1204,8 @@ define float @v_sqrt_f32_afn(float %x) {
 define float @v_sqrt_f32_afn_nsz(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_nsz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1214,9 +1214,9 @@ define float @v_sqrt_f32_afn_nsz(float %x) {
 define <2 x float> @v_sqrt_v2f32_afn(<2 x float> %x) {
 ; GCN-LABEL: v_sqrt_v2f32_afn:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
   ret <2 x float> %result
@@ -1225,8 +1225,8 @@ define <2 x float> @v_sqrt_v2f32_afn(<2 x float> %x) {
 define float @v_sqrt_f32_afn_nnan(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_nnan:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1235,8 +1235,8 @@ define float @v_sqrt_f32_afn_nnan(float %x) {
 define float @v_sqrt_f32_fabs_afn_ninf(float %x) {
 ; GCN-LABEL: v_sqrt_f32_fabs_afn_ninf:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call float @llvm.fabs.f32(float %x)
   %result = call afn ninf float @llvm.sqrt.f32(float %fabs)
@@ -1246,8 +1246,8 @@ define float @v_sqrt_f32_fabs_afn_ninf(float %x) {
 define float @v_sqrt_f32_afn_nnan_ninf(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1256,9 +1256,9 @@ define float @v_sqrt_f32_afn_nnan_ninf(float %x) {
 define <2 x float> @v_sqrt_v2f32_afn_nnan_ninf(<2 x float> %x) {
 ; GCN-LABEL: v_sqrt_v2f32_afn_nnan_ninf:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
   ret <2 x float> %result
@@ -1267,8 +1267,8 @@ define <2 x float> @v_sqrt_v2f32_afn_nnan_ninf(<2 x float> %x) {
 define float @v_sqrt_f32_afn_nnan_ninf_nsz(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf_nsz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1277,8 +1277,8 @@ define float @v_sqrt_f32_afn_nnan_ninf_nsz(float %x) {
 define float @v_sqrt_f32__approx_func_fp_math(float %x) #2 {
 ; GCN-LABEL: v_sqrt_f32__approx_func_fp_math:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1287,8 +1287,8 @@ define float @v_sqrt_f32__approx_func_fp_math(float %x) #2 {
 define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 {
 ; GCN-LABEL: v_sqrt_f32__enough_unsafe_attrs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1297,8 +1297,8 @@ define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 {
 define float @v_sqrt_f32__unsafe_attr(float %x) #4 {
 ; GCN-LABEL: v_sqrt_f32__unsafe_attr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -1307,7 +1307,6 @@ define float @v_sqrt_f32__unsafe_attr(float %x) #4 {
 define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
@@ -1342,11 +1341,11 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v3
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
@@ -1381,11 +1380,11 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_v2f32:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1418,11 +1417,11 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v1, v4
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_v2f32:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
@@ -1455,6 +1454,7 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v1, v4
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
   ret <2 x float> %result
@@ -1463,7 +1463,6 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v3f32:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
@@ -1514,11 +1513,11 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v4
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v3f32:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v0
@@ -1569,11 +1568,11 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_v3f32:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s6, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
@@ -1621,11 +1620,11 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v2, v5
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_v3f32:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v0
@@ -1673,6 +1672,7 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %x)
   ret <3 x float> %result
@@ -1682,7 +1682,6 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 define float @v_sqrt_f32_ulp05(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp05:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1701,11 +1700,11 @@ define float @v_sqrt_f32_ulp05(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp05:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1724,11 +1723,11 @@ define float @v_sqrt_f32_ulp05(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ulp05:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1746,11 +1745,11 @@ define float @v_sqrt_f32_ulp05(float %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ulp05:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1768,6 +1767,7 @@ define float @v_sqrt_f32_ulp05(float %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !0
   ret float %result
@@ -1777,7 +1777,6 @@ define float @v_sqrt_f32_ulp05(float %x) {
 define float @v_sqrt_f32_ulp1(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp1:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1796,11 +1795,11 @@ define float @v_sqrt_f32_ulp1(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp1:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -1819,12 +1818,13 @@ define float @v_sqrt_f32_ulp1(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp1:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !1
   ret float %result
@@ -1834,7 +1834,6 @@ define float @v_sqrt_f32_ulp1(float %x) {
 define float @v_sqrt_f32_ulp2(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1843,11 +1842,11 @@ define float @v_sqrt_f32_ulp2(float %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -1855,12 +1854,13 @@ define float @v_sqrt_f32_ulp2(float %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -1870,7 +1870,6 @@ define float @v_sqrt_f32_ulp2(float %x) {
 define float @v_sqrt_f32_ulp25(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp25:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1879,11 +1878,11 @@ define float @v_sqrt_f32_ulp25(float %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp25:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -1891,12 +1890,13 @@ define float @v_sqrt_f32_ulp25(float %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp25:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !3
   ret float %result
@@ -1906,7 +1906,6 @@ define float @v_sqrt_f32_ulp25(float %x) {
 define float @v_sqrt_f32_ulp3(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp3:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1915,11 +1914,11 @@ define float @v_sqrt_f32_ulp3(float %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp3:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -1927,12 +1926,13 @@ define float @v_sqrt_f32_ulp3(float %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp3:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !4
   ret float %result
@@ -1941,7 +1941,6 @@ define float @v_sqrt_f32_ulp3(float %x) {
 define float @v_sqrt_f32_ulp2_fabs(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_fabs:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
@@ -1950,11 +1949,11 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, s[4:5]
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_fabs:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, s[4:5]
@@ -1962,12 +1961,13 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, s[4:5]
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_fabs:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %result = call float @llvm.sqrt.f32(float %x.fabs), !fpmath !2
@@ -1977,7 +1977,6 @@ define float @v_sqrt_f32_ulp2_fabs(float %x) {
 define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp1:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
@@ -2012,11 +2011,11 @@ define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v3
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
@@ -2051,13 +2050,14 @@ define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp1:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
   ret <2 x float> %result
@@ -2067,7 +2067,6 @@ define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2083,11 +2082,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
@@ -2101,13 +2100,14 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
   ret <2 x float> %result
@@ -2116,7 +2116,6 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp1_fabs:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SDAG-IEEE-NEXT:    s_mov_b32 s7, 0x4f800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e64 v2, |v0|, s7
@@ -2152,11 +2151,11 @@ define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v3
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1_fabs:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e64 v4, |v0|, v3
@@ -2192,13 +2191,14 @@ define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp1_fabs:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e64 v0, |v0|
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e64 v1, |v1|
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
   %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !1
@@ -2209,7 +2209,6 @@ define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_fabs:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s6, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -2225,11 +2224,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[6:7]
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_fabs:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, s[4:5]
@@ -2243,13 +2242,14 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[6:7]
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_fabs:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e64 v0, |v0|
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e64 v1, |v1|
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
   %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !2
@@ -2260,8 +2260,8 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 define float @v_sqrt_f32_afn_ulp1(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_ulp1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
   ret float %result
@@ -2271,8 +2271,8 @@ define float @v_sqrt_f32_afn_ulp1(float %x) {
 define float @v_sqrt_f32_afn_ulp2(float %x) {
 ; GCN-LABEL: v_sqrt_f32_afn_ulp2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -2281,9 +2281,9 @@ define float @v_sqrt_f32_afn_ulp2(float %x) {
 define <2 x float> @v_sqrt_v2f32_afn_ulp1(<2 x float> %x) {
 ; GCN-LABEL: v_sqrt_v2f32_afn_ulp1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
   ret <2 x float> %result
@@ -2293,9 +2293,9 @@ define <2 x float> @v_sqrt_v2f32_afn_ulp1(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_afn_ulp2(<2 x float> %x) {
 ; GCN-LABEL: v_sqrt_v2f32_afn_ulp2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
   ret <2 x float> %result
@@ -2304,7 +2304,6 @@ define <2 x float> @v_sqrt_v2f32_afn_ulp2(<2 x float> %x) {
 define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -2321,11 +2320,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 ; SDAG-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -2341,13 +2340,14 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
   %result = fdiv float 1.0, %sqrt, !fpmath !3
@@ -2357,7 +2357,6 @@ define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
 define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4b800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2367,11 +2366,11 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2381,12 +2380,13 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
   %result = fdiv contract float 1.0, %sqrt, !fpmath !3
@@ -2396,7 +2396,6 @@ define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
 define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2418,11 +2417,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
@@ -2443,11 +2442,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x2f800000
@@ -2457,11 +2456,11 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x6f800000
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x2f800000
@@ -2471,6 +2470,7 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
   %result = fdiv float %y, %sqrt, !fpmath !3
@@ -2480,7 +2480,6 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2502,11 +2501,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
@@ -2527,11 +2526,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x2f800000
@@ -2541,11 +2540,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x6f800000
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x2f800000
@@ -2555,6 +2554,7 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
   %result = fdiv contract float %y, %sqrt, !fpmath !3
@@ -2564,7 +2564,6 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2582,11 +2581,11 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
@@ -2603,14 +2602,15 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
   %result = fdiv arcp contract float %y, %sqrt, !fpmath !3
@@ -2620,7 +2620,6 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
 define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -2651,11 +2650,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
@@ -2684,15 +2683,16 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
   %result = fdiv <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
@@ -2702,7 +2702,6 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2718,11 +2717,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4b800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
@@ -2738,13 +2737,14 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
   %result = fdiv contract <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
@@ -2754,7 +2754,6 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
 define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x float> %y) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -2795,11 +2794,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
@@ -2838,11 +2837,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
@@ -2859,11 +2858,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v5, v0
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x6f800000
@@ -2880,6 +2879,7 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v6, v0
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
   %result = fdiv contract <2 x float> %y, %sqrt, !fpmath !3
@@ -2889,7 +2889,6 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 x float> %y) {
 ; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
@@ -2922,11 +2921,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; SDAG-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
@@ -2957,17 +2956,18 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
   %result = fdiv arcp contract <2 x float> %y, %sqrt, !fpmath !3
@@ -3013,7 +3013,6 @@ define amdgpu_ps i32 @s_sqrt_f32_ulp3(float inreg %x) {
 define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3022,11 +3021,11 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3034,12 +3033,13 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3048,7 +3048,6 @@ define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x)
 define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3057,11 +3056,11 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3069,12 +3068,13 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3083,7 +3083,6 @@ define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub)
 define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_negdenormal:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3092,11 +3091,11 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_negdenormal:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3104,12 +3103,13 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_known_never_negdenormal:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3118,8 +3118,8 @@ define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
 define float @v_sqrt_f32_known_never_denormal(float nofpclass(sub) %x) {
 ; GCN-LABEL: v_sqrt_f32_known_never_denormal:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3128,7 +3128,6 @@ define float @v_sqrt_f32_known_never_denormal(float nofpclass(sub) %x) {
 define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_ninf_known_never_zero:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3147,11 +3146,11 @@ define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_ninf_known_never_zero:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3170,11 +3169,11 @@ define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_ninf_known_never_zero:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3192,11 +3191,11 @@ define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_ninf_known_never_zero:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3214,6 +3213,7 @@ define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -3222,7 +3222,6 @@ define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
 define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3241,11 +3240,11 @@ define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3264,11 +3263,11 @@ define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3286,11 +3285,11 @@ define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3308,6 +3307,7 @@ define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -3316,7 +3316,6 @@ define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
 define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_inf:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3335,11 +3334,11 @@ define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_inf:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3358,11 +3357,11 @@ define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_inf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3380,11 +3379,11 @@ define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_inf:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3402,6 +3401,7 @@ define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -3410,7 +3410,6 @@ define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x
 define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3429,11 +3428,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf)
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3452,11 +3451,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3474,11 +3473,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf)
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3496,6 +3495,7 @@ define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -3504,7 +3504,6 @@ define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf)
 define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3523,11 +3522,11 @@ define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf)
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3546,11 +3545,11 @@ define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3568,11 +3567,11 @@ define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf)
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3590,6 +3589,7 @@ define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x)
   ret float %result
@@ -3598,7 +3598,6 @@ define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf)
 define float @v_sqrt_f32_frexp_src(float %x) {
 ; SDAG-LABEL: v_sqrt_f32_frexp_src:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -3620,11 +3619,11 @@ define float @v_sqrt_f32_frexp_src(float %x) {
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f32_frexp_src:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
@@ -3646,6 +3645,7 @@ define float @v_sqrt_f32_frexp_src(float %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x260
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
   %frexp.mant = extractvalue { float, i32 } %frexp, 0
@@ -3656,22 +3656,22 @@ define float @v_sqrt_f32_frexp_src(float %x) {
 define float @v_sqrt_f32_ulp3_frexp_src(float %x) {
 ; SDAG-LABEL: v_sqrt_f32_ulp3_frexp_src:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f32_ulp3_frexp_src:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
   %frexp.mant = extractvalue { float, i32 } %frexp, 0
@@ -3682,7 +3682,6 @@ define float @v_sqrt_f32_ulp3_frexp_src(float %x) {
 define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero ninf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3691,11 +3690,11 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3703,12 +3702,13 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3717,7 +3717,6 @@ define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero n
 define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_known_never_ninf_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3726,11 +3725,11 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_known_never_ninf_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3738,12 +3737,13 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_known_never_ninf_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3752,7 +3752,6 @@ define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; SDAG-IEEE-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3761,11 +3760,11 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3773,12 +3772,13 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
   ret float %result
@@ -3787,7 +3787,6 @@ define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
 define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; SDAG-IEEE-LABEL: v_elim_redun_check_ult_sqrt:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3806,11 +3805,11 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_elim_redun_check_ult_sqrt:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3833,11 +3832,11 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_elim_redun_check_ult_sqrt:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3855,11 +3854,11 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
 ; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_elim_redun_check_ult_sqrt:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0xf800000
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v1, v0
@@ -3881,6 +3880,7 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GISEL-DAZ-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp ult float %in, -0.000000e+00
@@ -3891,7 +3891,6 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
 ; SDAG-IEEE-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
 ; SDAG-IEEE:       ; %bb.0:
-; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -3904,11 +3903,11 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
 ; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
 ; GISEL-IEEE:       ; %bb.0:
-; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -3920,26 +3919,27 @@ define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
 ; GISEL-IEEE-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
 ; SDAG-DAZ:       ; %bb.0:
-; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v1, v0
 ; SDAG-DAZ-NEXT:    s_brev_b32 s4, 1
 ; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; SDAG-DAZ-NEXT:    v_cmp_le_f32_e32 vcc, s4, v0
 ; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
 ; GISEL-DAZ:       ; %bb.0:
-; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v1, v0
 ; GISEL-DAZ-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; GISEL-DAZ-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %in), !fpmath !4
   %cmp = fcmp ult float %in, -0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 196a3705ac81876..ae5699db8cc9e21 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -8,7 +8,6 @@
 define double @v_sqrt_f64(double %x) {
 ; SDAG-LABEL: v_sqrt_f64:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -32,11 +31,11 @@ define double @v_sqrt_f64(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -60,6 +59,7 @@ define double @v_sqrt_f64(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -68,7 +68,6 @@ define double @v_sqrt_f64(double %x) {
 define double @v_sqrt_f64_fneg(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_fneg:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 9
 ; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
@@ -92,11 +91,11 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_fneg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -v[0:1], v[2:3]
@@ -120,6 +119,7 @@ define double @v_sqrt_f64_fneg(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.neg = fneg double %x
   %result = call double @llvm.sqrt.f64(double %x.neg)
@@ -129,7 +129,6 @@ define double @v_sqrt_f64_fneg(double %x) {
 define double @v_sqrt_f64_fabs(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_fabs:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -153,11 +152,11 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_fabs:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
@@ -181,6 +180,7 @@ define double @v_sqrt_f64_fabs(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %result = call double @llvm.sqrt.f64(double %x.fabs)
@@ -190,7 +190,6 @@ define double @v_sqrt_f64_fabs(double %x) {
 define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 9
 ; SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -214,11 +213,11 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
@@ -242,6 +241,7 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call double @llvm.fabs.f64(double %x)
   %x.fabs.neg = fneg double %x.fabs
@@ -252,7 +252,6 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 define double @v_sqrt_f64_ninf(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_ninf:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -276,11 +275,11 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -304,6 +303,7 @@ define double @v_sqrt_f64_ninf(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -312,7 +312,6 @@ define double @v_sqrt_f64_ninf(double %x) {
 define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
 ; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -336,11 +335,11 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -364,6 +363,7 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call ninf double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -372,7 +372,6 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 define double @v_sqrt_f64_nnan(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_nnan:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -396,11 +395,11 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_nnan:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -424,6 +423,7 @@ define double @v_sqrt_f64_nnan(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -708,7 +708,6 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 define double @v_sqrt_f64_nsz(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -732,11 +731,11 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -760,6 +759,7 @@ define double @v_sqrt_f64_nsz(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -768,7 +768,6 @@ define double @v_sqrt_f64_nsz(double %x) {
 define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -792,11 +791,11 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -820,6 +819,7 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -828,7 +828,6 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -852,11 +851,11 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -880,6 +879,7 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -888,7 +888,6 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 define double @v_sqrt_f64_afn(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_afn:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -912,11 +911,11 @@ define double @v_sqrt_f64_afn(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_afn:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -940,6 +939,7 @@ define double @v_sqrt_f64_afn(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -948,7 +948,6 @@ define double @v_sqrt_f64_afn(double %x) {
 define double @v_sqrt_f64_afn_nsz(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_afn_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -972,11 +971,11 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_afn_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1000,6 +999,7 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1008,7 +1008,6 @@ define double @v_sqrt_f64_afn_nsz(double %x) {
 define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; SDAG-LABEL: v_sqrt_v2f64_afn:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1051,11 +1050,11 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_v2f64_afn:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -1099,6 +1098,7 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
@@ -1107,7 +1107,6 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 define double @v_sqrt_f64_afn_nnan(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_afn_nnan:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1131,11 +1130,11 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1159,6 +1158,7 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1167,7 +1167,6 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -1191,11 +1190,11 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
@@ -1219,6 +1218,7 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call double @llvm.fabs.f64(double %x)
   %result = call afn ninf double @llvm.sqrt.f64(double %fabs)
@@ -1228,7 +1228,6 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1252,11 +1251,11 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1280,6 +1279,7 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1288,7 +1288,6 @@ define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
 define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; SDAG-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1331,11 +1330,11 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_v2f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -1379,6 +1378,7 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
@@ -1387,7 +1387,6 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1411,11 +1410,11 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1439,6 +1438,7 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1447,7 +1447,6 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1471,11 +1470,11 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1499,6 +1498,7 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1507,7 +1507,6 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1531,11 +1530,11 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1559,6 +1558,7 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1567,7 +1567,6 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1591,11 +1590,11 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1619,6 +1618,7 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 ; GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call nsz double @llvm.sqrt.f64(double %x)
   ret double %result
@@ -1627,7 +1627,6 @@ define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
 define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; SDAG-LABEL: v_sqrt_v2f64:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0
 ; SDAG-NEXT:    s_brev_b32 s5, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1670,11 +1669,11 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_v2f64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
 ; GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -1718,6 +1717,7 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   ret <2 x double> %result
@@ -1726,7 +1726,6 @@ define <2 x double> @v_sqrt_v2f64(<2 x double> %x) {
 define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; SDAG-LABEL: v_sqrt_v3f64:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s6, 0
 ; SDAG-NEXT:    s_brev_b32 s7, 8
 ; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1]
@@ -1788,11 +1787,11 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; SDAG-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sqrt_v3f64:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s4, 0
 ; GISEL-NEXT:    s_brev_b32 s5, 8
 ; GISEL-NEXT:    v_mov_b32_e32 v6, s4
@@ -1854,6 +1853,7 @@ define <3 x double> @v_sqrt_v3f64(<3 x double> %x) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[6:7]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s[6:7]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x)
   ret <3 x double> %result
diff --git a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
index 0b8f4722f507d36..63e08ab5f0b6bcd 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub-as-fneg-src-modifier.ll
@@ -8,9 +8,9 @@
 define float @no_fold_f32_fsub_into_fneg_modifier_ieee_pos1(float %v0, float %v1) #0 {
 ; CHECK-LABEL: no_fold_f32_fsub_into_fneg_modifier_ieee_pos1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v0, 1.0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float 1.0, %v0
   %mul = fmul float %sub, %v1
@@ -20,9 +20,9 @@ define float @no_fold_f32_fsub_into_fneg_modifier_ieee_pos1(float %v0, float %v1
 define float @no_fold_f32_fsub_into_fneg_modifier_daz_pos1(float %v0, float %v1) #1 {
 ; CHECK-LABEL: no_fold_f32_fsub_into_fneg_modifier_daz_pos1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v0, 1.0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float 1.0, %v0
   %mul = fmul float %sub, %v1
@@ -32,16 +32,16 @@ define float @no_fold_f32_fsub_into_fneg_modifier_daz_pos1(float %v0, float %v1)
 define float @no_fold_f32_fsub_into_fneg_modifier_ieee_commuted(float %v0, float %v1) #0 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_ieee_commuted:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_add_f32_e32 v0, 0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_ieee_commuted:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_subrev_f32_e32 v0, 0x80000000, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float %v0, -0.0
   %mul = fmul float %sub, %v1
@@ -51,15 +51,15 @@ define float @no_fold_f32_fsub_into_fneg_modifier_ieee_commuted(float %v0, float
 define float @fold_f32_fsub_into_fneg_modifier_ieee_pos0(float %v0, float %v1) #0 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_pos0:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_sub_f32_e32 v0, 0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float 0.0, %v0
   %mul = fmul float %sub, %v1
@@ -69,15 +69,15 @@ define float @fold_f32_fsub_into_fneg_modifier_ieee_pos0(float %v0, float %v1) #
 define float @fold_f32_fsub_into_fneg_modifier_daz_pos0(float %v0, float %v1) #1 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_pos0:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_sub_f32_e32 v0, 0, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float 0.0, %v0
   %mul = fmul float %sub, %v1
@@ -87,16 +87,16 @@ define float @fold_f32_fsub_into_fneg_modifier_daz_pos0(float %v0, float %v1) #1
 define float @no_fold_f32_fsub_into_fneg_modifier_daz_commuted(float %v0, float %v1) #1 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_daz_commuted:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_add_f32_e32 v0, 0, v0
 ; SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_daz_commuted:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_subrev_f32_e32 v0, 0x80000000, v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float %v0, -0.0
   %mul = fmul float %sub, %v1
@@ -106,15 +106,15 @@ define float @no_fold_f32_fsub_into_fneg_modifier_daz_commuted(float %v0, float
 define float @fold_f32_fsub_into_fneg_modifier_ieee(float %v0, float %v1) #0 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = fmul float %sub, %v1
@@ -124,15 +124,15 @@ define float @fold_f32_fsub_into_fneg_modifier_ieee(float %v0, float %v1) #0 {
 define float @fold_f32_fsub_into_fneg_modifier_daz(float %v0, float %v1) #1 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = fmul float %sub, %v1
@@ -142,15 +142,15 @@ define float @fold_f32_fsub_into_fneg_modifier_daz(float %v0, float %v1) #1 {
 define float @fold_f32_fsub_into_fneg_modifier_ieee_nsz(float %v0, float %v1) #0 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_ieee_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz float -0.0, %v0
   %mul = fmul nsz float %sub, %v1
@@ -160,15 +160,15 @@ define float @fold_f32_fsub_into_fneg_modifier_ieee_nsz(float %v0, float %v1) #0
 define float @fold_f32_fsub_into_fneg_modifier_daz_nsz(float %v0, float %v1) #1 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_daz_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz float -0.0, %v0
   %mul = fmul nsz float %sub, %v1
@@ -178,15 +178,15 @@ define float @fold_f32_fsub_into_fneg_modifier_daz_nsz(float %v0, float %v1) #1
 define float @fold_f32_fsub_into_fneg_modifier_dynamic(float %v0, float %v1) #2 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = fmul float %sub, %v1
@@ -196,15 +196,15 @@ define float @fold_f32_fsub_into_fneg_modifier_dynamic(float %v0, float %v1) #2
 define float @fold_f32_fsub_into_fneg_modifier_dynamic_nsz(float %v0, float %v1) #2 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz float -0.0, %v0
   %mul = fmul nsz float %sub, %v1
@@ -214,18 +214,18 @@ define float @fold_f32_fsub_into_fneg_modifier_dynamic_nsz(float %v0, float %v1)
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_ieee(<2 x float> %v0, <2 x float> %v1) #0 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul <2 x float> %sub, %v1
@@ -235,18 +235,18 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_ieee(<2 x float> %v0, <2
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_daz(<2 x float> %v0, <2 x float> %v1) #1 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul <2 x float> %sub, %v1
@@ -256,18 +256,18 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_daz(<2 x float> %v0, <2 x
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_ieee_nsz(<2 x float> %v0, <2 x float> %v1) #0 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_ieee_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_ieee_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul nsz <2 x float> %sub, %v1
@@ -277,18 +277,18 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_ieee_nsz(<2 x float> %v0,
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_daz_nsz(<2 x float> %v0, <2 x float> %v1) #1 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_daz_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul nsz <2 x float> %sub, %v1
@@ -298,18 +298,18 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_daz_nsz(<2 x float> %v0,
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_dynamic(<2 x float> %v0, <2 x float> %v1) #2 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul <2 x float> %sub, %v1
@@ -319,18 +319,18 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_dynamic(<2 x float> %v0,
 define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz(<2 x float> %v0, <2 x float> %v1) #2 {
 ; SDAG-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; SDAG-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x float> <float -0.0, float -0.0>, %v0
   %mul = fmul nsz <2 x float> %sub, %v1
@@ -341,15 +341,15 @@ define <2 x float> @fold_v2f32_fsub_into_fneg_modifier_dynamic_nsz(<2 x float> %
 define half @fold_f16_fsub_into_fneg_modifier_ieee(half %v0, half %v1) #0 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = fmul half %sub, %v1
@@ -359,15 +359,15 @@ define half @fold_f16_fsub_into_fneg_modifier_ieee(half %v0, half %v1) #0 {
 define half @fold_f16_fsub_into_fneg_modifier_daz(half %v0, half %v1) #1 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = fmul half %sub, %v1
@@ -377,15 +377,15 @@ define half @fold_f16_fsub_into_fneg_modifier_daz(half %v0, half %v1) #1 {
 define half @fold_f16_fsub_into_fneg_modifier_ieee_nsz(half %v0, half %v1) #0 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_ieee_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_ieee_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz half -0.0, %v0
   %mul = fmul nsz half %sub, %v1
@@ -395,15 +395,15 @@ define half @fold_f16_fsub_into_fneg_modifier_ieee_nsz(half %v0, half %v1) #0 {
 define half @fold_f16_fsub_into_fneg_modifier_daz_nsz(half %v0, half %v1) #1 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_daz_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz half -0.0, %v0
   %mul = fmul nsz half %sub, %v1
@@ -413,15 +413,15 @@ define half @fold_f16_fsub_into_fneg_modifier_daz_nsz(half %v0, half %v1) #1 {
 define half @fold_f16_fsub_into_fneg_modifier_dynamic(half %v0, half %v1) #2 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = fmul half %sub, %v1
@@ -431,15 +431,15 @@ define half @fold_f16_fsub_into_fneg_modifier_dynamic(half %v0, half %v1) #2 {
 define half @fold_f16_fsub_into_fneg_modifier_dynamic_nsz(half %v0, half %v1) #2 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f16_e64 v0, -v0, v1
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_dynamic_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz half -0.0, %v0
   %mul = fmul nsz half %sub, %v1
@@ -449,15 +449,15 @@ define half @fold_f16_fsub_into_fneg_modifier_dynamic_nsz(half %v0, half %v1) #2
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_ieee(<2 x half> %v0, <2 x half> %v1) #0 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul <2 x half> %sub, %v1
@@ -467,15 +467,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_ieee(<2 x half> %v0, <2 x
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_daz(<2 x half> %v0, <2 x half> %v1) #1 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul <2 x half> %sub, %v1
@@ -485,15 +485,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_daz(<2 x half> %v0, <2 x h
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_ieee_nsz(<2 x half> %v0, <2 x half> %v1) #0 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_ieee_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_ieee_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul nsz <2 x half> %sub, %v1
@@ -503,15 +503,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_ieee_nsz(<2 x half> %v0, <
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_daz_nsz(<2 x half> %v0, <2 x half> %v1) #1 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_daz_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul nsz <2 x half> %sub, %v1
@@ -521,15 +521,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_daz_nsz(<2 x half> %v0, <2
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_dynamic(<2 x half> %v0, <2 x half> %v1) #2 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul <2 x half> %sub, %v1
@@ -539,15 +539,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_dynamic(<2 x half> %v0, <2
 define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_dynamic_nsz(<2 x half> %v0, <2 x half> %v1) #2 {
 ; SDAG-LABEL: fold_v2f16_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_fsub_into_fneg_modifier_dynamic_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz <2 x half> <half -0.0, half -0.0>, %v0
   %mul = fmul nsz <2 x half> %sub, %v1
@@ -557,15 +557,15 @@ define <2 x half> @fold_v2f16_fsub_into_fneg_modifier_dynamic_nsz(<2 x half> %v0
 define double @fold_f64_fsub_into_fneg_modifier_ieee(double %v0, double %v1) #0 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = fmul double %sub, %v1
@@ -575,15 +575,15 @@ define double @fold_f64_fsub_into_fneg_modifier_ieee(double %v0, double %v1) #0
 define double @fold_f64_fsub_into_fneg_modifier_daz(double %v0, double %v1) #1 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = fmul double %sub, %v1
@@ -593,15 +593,15 @@ define double @fold_f64_fsub_into_fneg_modifier_daz(double %v0, double %v1) #1 {
 define double @fold_f64_fsub_into_fneg_modifier_ieee_nsz(double %v0, double %v1) #0 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_ieee_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_ieee_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz double -0.0, %v0
   %mul = fmul nsz double %sub, %v1
@@ -611,15 +611,15 @@ define double @fold_f64_fsub_into_fneg_modifier_ieee_nsz(double %v0, double %v1)
 define double @fold_f64_fsub_into_fneg_modifier_daz_nsz(double %v0, double %v1) #1 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_daz_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_daz_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz double -0.0, %v0
   %mul = fmul nsz double %sub, %v1
@@ -629,15 +629,15 @@ define double @fold_f64_fsub_into_fneg_modifier_daz_nsz(double %v0, double %v1)
 define double @fold_f64_fsub_into_fneg_modifier_dynamic(double %v0, double %v1) #2 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = fmul double %sub, %v1
@@ -647,15 +647,15 @@ define double @fold_f64_fsub_into_fneg_modifier_dynamic(double %v0, double %v1)
 define double @fold_f64_fsub_into_fneg_modifier_dynamic_nsz(double %v0, double %v1) #2 {
 ; SDAG-LABEL: fold_f64_fsub_into_fneg_modifier_dynamic_nsz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_fsub_into_fneg_modifier_dynamic_nsz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
 ; GISEL-NEXT:    v_mul_f64 v[0:1], -v[0:1], v[2:3]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub nsz double -0.0, %v0
   %mul = fmul nsz double %sub, %v1
@@ -665,19 +665,19 @@ define double @fold_f64_fsub_into_fneg_modifier_dynamic_nsz(double %v0, double %
 define float @fold_f32_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, float %v0, float %v1) #0 {
 ; SDAG-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -687,20 +687,20 @@ define float @fold_f32_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, float
 define float @no_fold_f32_select_user_fsub_into_fneg_modifier_daz(i1 %cond, float %v0, float %v1) #1 {
 ; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -710,20 +710,20 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_daz(i1 %cond, floa
 define float @no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, float %v0, float %v1) #2 {
 ; SDAG-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f32_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %mul = select i1 %cond, float %sub, float %v1
@@ -733,20 +733,20 @@ define float @no_fold_f32_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
 define half @fold_f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, half %v0, half %v1) #0 {
 ; SDAG-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -756,20 +756,20 @@ define half @fold_f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, half %v
 define half @no_fold_f16_select_user_fsub_into_fneg_modifier_daz(i1 %cond, half %v0, half %v1) #1 {
 ; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_sub_f16_e32 v1, 0x8000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -779,20 +779,20 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_daz(i1 %cond, half
 define half @no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, half %v0, half %v1) #2 {
 ; SDAG-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_sub_f16_e32 v1, 0x8000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_max_f16_e64 v1, -v1, -v1
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %mul = select i1 %cond, half %sub, half %v1
@@ -802,22 +802,22 @@ define half @no_fold_f16_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, h
 define double @fold_f64_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, double %v0, double %v1) #0 {
 ; SDAG-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f64_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -827,22 +827,22 @@ define double @fold_f64_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, doubl
 define double @no_fold_f64_select_user_fsub_into_fneg_modifier_daz(i1 %cond, double %v0, double %v1) #1 {
 ; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -852,22 +852,22 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_daz(i1 %cond, dou
 define double @no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, double %v0, double %v1) #2 {
 ; SDAG-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[1:2], -v[1:2], -v[1:2]
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %mul = select i1 %cond, double %sub, double %v1
@@ -877,20 +877,20 @@ define double @no_fold_f64_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond,
 define <2 x half> @fold_v2f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond, <2 x half> %v0, <2 x half> %v1) #0 {
 ; SDAG-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_v2f16_select_user_fsub_into_fneg_modifier_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -900,20 +900,20 @@ define <2 x half> @fold_v2f16_select_user_fsub_into_fneg_modifier_ieee(i1 %cond,
 define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz(i1 %cond, <2 x half> %v0, <2 x half> %v1) #1 {
 ; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -923,20 +923,20 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_daz(i1 %con
 define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic(i1 %cond, <2 x half> %v0, <2 x half> %v1) #2 {
 ; SDAG-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1]
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub <2 x half> <half -0.0, half -0.0>, %v0
   %mul = select i1 %cond, <2 x half> %sub, <2 x half> %v1
@@ -946,9 +946,9 @@ define <2 x half> @no_fold_v2f16_select_user_fsub_into_fneg_modifier_dynamic(i1
 define float @fold_f32_strict_fsub_into_fneg_modifier_ieee(float %v0, float %v1) #0 {
 ; CHECK-LABEL: fold_f32_strict_fsub_into_fneg_modifier_ieee:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sub = call float @llvm.experimental.constrained.fsub.f32(float -0.0, float %v0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   %mul = call float @llvm.experimental.constrained.fmul.f32(float %sub, float %v1, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -958,9 +958,9 @@ define float @fold_f32_strict_fsub_into_fneg_modifier_ieee(float %v0, float %v1)
 define float @fold_f32_strict_fsub_into_fneg_modifier_daz(float %v0, float %v1) #1 {
 ; CHECK-LABEL: fold_f32_strict_fsub_into_fneg_modifier_daz:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sub = call float @llvm.experimental.constrained.fsub.f32(float -0.0, float %v0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   %mul = call float @llvm.experimental.constrained.fmul.f32(float %sub, float %v1, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -970,9 +970,9 @@ define float @fold_f32_strict_fsub_into_fneg_modifier_daz(float %v0, float %v1)
 define float @fold_f32_strict_fsub_into_fneg_modifier_dynamic(float %v0, float %v1) #2 {
 ; CHECK-LABEL: fold_f32_strict_fsub_into_fneg_modifier_dynamic:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sub = call float @llvm.experimental.constrained.fsub.f32(float -0.0, float %v0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   %mul = call float @llvm.experimental.constrained.fmul.f32(float %sub, float %v1, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -982,17 +982,17 @@ define float @fold_f32_strict_fsub_into_fneg_modifier_dynamic(float %v0, float %
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, 1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1002,18 +1002,18 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_ieee(float %v0) #0 {
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1023,18 +1023,18 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_daz(float %v0) #1 {
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #2 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 1)
@@ -1044,19 +1044,19 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_issnan_dynamic(float %v0) #
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0) #0 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1066,20 +1066,20 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_ieee(float %v0)
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #1 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
 ; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1089,20 +1089,20 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_daz(float %v0) #
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v0) #2 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
 ; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f32(float %sub, i32 144)
@@ -1112,17 +1112,17 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_isdenormal_dynamic(float %v
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_ieee(float %v0, i32 %testmask) #0 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_ieee:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f32_e64 s[4:5], -v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_ieee:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1132,18 +1132,18 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_ieee(float %v0, i32 %te
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_daz(float %v0, i32 %testmask) #1 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1153,18 +1153,18 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_daz(float %v0, i32 %tes
 define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic(float %v0, i32 %testmask) #2 {
 ; SDAG-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f32(float %sub, i32 %testmask)
@@ -1174,17 +1174,17 @@ define i1 @no_fold_f32_fsub_into_fneg_modifier_class_var_dynamic(float %v0, i32
 define i1 @no_fold_f64_fsub_into_fneg_modifier_class_var_daz(double %v0, i32 %testmask) #1 {
 ; SDAG-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_var_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f64(double %sub, i32 %testmask)
@@ -1194,18 +1194,18 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_var_daz(double %v0, i32 %te
 define i1 @no_fold_f16_fsub_into_fneg_modifier_class_var_daz(half %v0, i32 %testmask) #1 {
 ; SDAG-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_var_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
 ; SDAG-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_var_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %class = call i1 @llvm.amdgcn.class.f16(half %sub, i32 %testmask)
@@ -1215,19 +1215,19 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_var_daz(half %v0, i32 %test
 define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
 ; SDAG-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0x90
 ; SDAG-NEXT:    v_cmp_class_f64_e64 s[4:5], -v[0:1], v2
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f64_fsub_into_fneg_modifier_class_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x90
 ; GISEL-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub double -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f64(double %sub, i32 144)
@@ -1237,20 +1237,20 @@ define i1 @no_fold_f64_fsub_into_fneg_modifier_class_daz(double %v0) #1 {
 define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
 ; SDAG-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v1, 0x90
 ; SDAG-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: no_fold_f16_fsub_into_fneg_modifier_class_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e64 v0, -v0, -v0
 ; GISEL-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GISEL-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub half -0.0, %v0
   %class = call i1 @llvm.is.fpclass.f16(half %sub, i32 144)
@@ -1260,20 +1260,20 @@ define i1 @no_fold_f16_fsub_into_fneg_modifier_class_daz(half %v0) #1 {
 define amdgpu_gfx float @fold_f32_fsub_into_fneg_modifier_interp_daz(float %v0, i32 inreg %v1) #1 {
 ; SDAG-LABEL: fold_f32_fsub_into_fneg_modifier_interp_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
 ; SDAG-NEXT:    s_mov_b32 m0, s4
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_interp_p1_f32_e32 v0, v0, attr0.x
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f32_fsub_into_fneg_modifier_interp_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    s_mov_b32 m0, s4
 ; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_interp_p1_f32_e32 v0, v0, attr0.x
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %p0_0 = call float @llvm.amdgcn.interp.p1(float %sub, i32 0, i32 0, i32 %v1)
@@ -1283,19 +1283,19 @@ define amdgpu_gfx float @fold_f32_fsub_into_fneg_modifier_interp_daz(float %v0,
 define amdgpu_gfx float @fold_f16_fsub_into_fneg_modifier_interp_daz(float %v0, i32 inreg %m0) #1 {
 ; SDAG-LABEL: fold_f16_fsub_into_fneg_modifier_interp_daz:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 m0, s4
 ; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
 ; SDAG-NEXT:    v_interp_p1ll_f16 v0, -v0, attr2.y
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: fold_f16_fsub_into_fneg_modifier_interp_daz:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GISEL-NEXT:    s_mov_b32 m0, s4
 ; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3
 ; GISEL-NEXT:    v_interp_p1ll_f16 v0, v0, attr2.y
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sub = fsub float -0.0, %v0
   %p1_0 = call float @llvm.amdgcn.interp.p1.f16(float %sub, i32 1, i32 2, i1 0, i32 %m0)
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 14d98c848119719..cf8f0173cc6e9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -5,20 +5,19 @@
 define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i1_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i1_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s0, s0, 1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -27,17 +26,16 @@ define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
 define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i8_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_byte v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i8_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i8 %arg0, ptr addrspace(1) undef
   ret void
@@ -46,17 +44,16 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
 define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_short v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i16 %arg0, ptr addrspace(1) undef
   ret void
@@ -65,17 +62,16 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
 define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i32 %arg0, ptr addrspace(1) undef
   ret void
@@ -84,18 +80,17 @@ define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
 define void @void_func_i64_inreg(i64 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i64 %arg0, ptr addrspace(1) undef
   ret void
@@ -104,17 +99,16 @@ define void @void_func_i64_inreg(i64 inreg %arg0) #0 {
 define void @void_func_f16_inreg(half inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_short v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store half %arg0, ptr addrspace(1) undef
   ret void
@@ -123,17 +117,16 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 {
 define void @void_func_f32_inreg(float inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store float %arg0, ptr addrspace(1) undef
   ret void
@@ -142,18 +135,17 @@ define void @void_func_f32_inreg(float inreg %arg0) #0 {
 define void @void_func_f64_inreg(double inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store double %arg0, ptr addrspace(1) undef
   ret void
@@ -162,17 +154,16 @@ define void @void_func_f64_inreg(double inreg %arg0) #0 {
 define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -181,21 +172,20 @@ define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
 define void @void_func_v3i16_inreg(<3 x i16> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    global_store_short v[0:1], v0, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -213,18 +203,17 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 {
 ;
 ; GFX9-LABEL: void_func_v4i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -233,23 +222,22 @@ define void @void_func_v4i16_inreg(<4 x i16> inreg %arg0) #0 {
 define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v5i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    global_store_short v[0:1], v0, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -258,21 +246,20 @@ define void @void_func_v5i16_inreg(<5 x i16> inreg %arg0) #0 {
 define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -281,18 +268,17 @@ define void @void_func_v8i16_inreg(<8 x i16> inreg %arg0) #0 {
 define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -301,20 +287,19 @@ define void @void_func_v2i32_inreg(<2 x i32> inreg %arg0) #0 {
 define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -323,21 +308,20 @@ define void @void_func_v3i32_inreg(<3 x i32> inreg %arg0) #0 {
 define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v4i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -346,7 +330,6 @@ define void @void_func_v4i32_inreg(<4 x i32> inreg %arg0) #0 {
 define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v5i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
@@ -354,18 +337,18 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -374,7 +357,6 @@ define void @void_func_v5i32_inreg(<5 x i32> inreg %arg0) #0 {
 define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -386,12 +368,11 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -399,6 +380,7 @@ define void @void_func_v8i32_inreg(<8 x i32> inreg %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -436,7 +418,6 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
 ;
 ; GFX9-LABEL: void_func_v16i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
@@ -460,12 +441,11 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
@@ -479,6 +459,7 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -487,7 +468,6 @@ define void @void_func_v16i32_inreg(<16 x i32> inreg %arg0) #0 {
 define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v32i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
@@ -530,12 +510,11 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
@@ -562,6 +541,7 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <32 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -570,21 +550,20 @@ define void @void_func_v32i32_inreg(<32 x i32> inreg %arg0) #0 {
 define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -593,7 +572,6 @@ define void @void_func_v2i64_inreg(<2 x i64> inreg %arg0) #0 {
 define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
@@ -602,18 +580,18 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -622,7 +600,6 @@ define void @void_func_v3i64_inreg(<3 x i64> inreg %arg0) #0 {
 define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v4i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -634,12 +611,11 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -647,6 +623,7 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -655,7 +632,6 @@ define void @void_func_v4i64_inreg(<4 x i64> inreg %arg0) #0 {
 define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v5i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -671,12 +647,11 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s13
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -686,6 +661,7 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[8:9], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -694,7 +670,6 @@ define void @void_func_v5i64_inreg(<5 x i64> inreg %arg0) #0 {
 define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
@@ -718,12 +693,11 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
@@ -737,6 +711,7 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -745,7 +720,6 @@ define void @void_func_v8i64_inreg(<8 x i64> inreg %arg0) #0 {
 define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v16i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
@@ -788,12 +762,11 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
@@ -820,6 +793,7 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -828,17 +802,16 @@ define void @void_func_v16i64_inreg(<16 x i64> inreg %arg0) #0 {
 define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -847,21 +820,20 @@ define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
 define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    global_store_short v[0:1], v0, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -870,18 +842,17 @@ define void @void_func_v3f16_inreg(<3 x half> inreg %arg0) #0 {
 define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v4f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -890,21 +861,20 @@ define void @void_func_v4f16_inreg(<4 x half> inreg %arg0) #0 {
 define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -913,7 +883,6 @@ define void @void_func_v8f16_inreg(<8 x half> inreg %arg0) #0 {
 define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v16f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -925,12 +894,11 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -938,6 +906,7 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -946,18 +915,17 @@ define void @void_func_v16f16_inreg(<16 x half> inreg %arg0) #0 {
 define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -966,20 +934,19 @@ define void @void_func_v2f32_inreg(<2 x float> inreg %arg0) #0 {
 define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    global_store_dwordx3 v[0:1], v[0:2], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -988,21 +955,20 @@ define void @void_func_v3f32_inreg(<3 x float> inreg %arg0) #0 {
 define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v4f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1011,7 +977,6 @@ define void @void_func_v4f32_inreg(<4 x float> inreg %arg0) #0 {
 define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -1023,12 +988,11 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1036,6 +1000,7 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1044,7 +1009,6 @@ define void @void_func_v8f32_inreg(<8 x float> inreg %arg0) #0 {
 define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v16f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
@@ -1068,12 +1032,11 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
@@ -1087,6 +1050,7 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1095,21 +1059,20 @@ define void @void_func_v16f32_inreg(<16 x float> inreg %arg0) #0 {
 define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v2f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -1118,7 +1081,6 @@ define void @void_func_v2f64_inreg(<2 x double> inreg %arg0) #0 {
 define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v3f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
@@ -1127,18 +1089,18 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -1147,7 +1109,6 @@ define void @void_func_v3f64_inreg(<3 x double> inreg %arg0) #0 {
 define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v4f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s10
@@ -1159,12 +1120,11 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
@@ -1172,6 +1132,7 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -1180,7 +1141,6 @@ define void @void_func_v4f64_inreg(<4 x double> inreg %arg0) #0 {
 define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v8f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s18
@@ -1204,12 +1164,11 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9
@@ -1223,6 +1182,7 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -1231,7 +1191,6 @@ define void @void_func_v8f64_inreg(<8 x double> inreg %arg0) #0 {
 define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_v16f64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
@@ -1274,12 +1233,11 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_mov_b32 v5, s25
@@ -1306,6 +1264,7 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[12:15], off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[16:19], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -1314,7 +1273,6 @@ define void @void_func_v16f64_inreg(<16 x double> inreg %arg0) #0 {
 define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inreg %arg1, i8 inreg %arg2, i16 inreg %arg3, half inreg %arg4) #0 {
 ; GFX9-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s28
@@ -1367,12 +1325,11 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr
 ; GFX9-NEXT:    global_store_short v[0:1], v8, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_short v[0:1], v9, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
 ; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
@@ -1414,6 +1371,7 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v5, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -1426,7 +1384,6 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr
 define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32> inreg %arg1, <2 x float> inreg %arg2) #0 {
 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v13, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s28
@@ -1474,12 +1431,11 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
 ; GFX11-NEXT:    v_dual_mov_b32 v10, s24 :: v_dual_mov_b32 v11, s25
@@ -1516,6 +1472,7 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[4:5], off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i32> %arg1, ptr addrspace(1) undef
@@ -1526,7 +1483,6 @@ define void @void_func_v32i32_v2i32_v2f32_inreg(<32 x i32> inreg %arg0, <2 x i32
 define void @too_many_args_use_workitem_id_x_inreg(
 ; GFX9-LABEL: too_many_args_use_workitem_id_x_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -1616,12 +1572,11 @@ define void @too_many_args_use_workitem_id_x_inreg(
 ; GFX9-NEXT:    global_store_dword v[0:1], v4, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[0:1], v5, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: too_many_args_use_workitem_id_x_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX11-NEXT:    v_mov_b32_e32 v6, s4
@@ -1704,6 +1659,7 @@ define void @too_many_args_use_workitem_id_x_inreg(
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   i32 inreg %arg0, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7,
   i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15,
@@ -1754,23 +1710,22 @@ define void @too_many_args_use_workitem_id_x_inreg(
 define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg1) #0 {
 ; GFX9-LABEL: void_func_i32_v2float_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i32_v2float_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    global_store_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i32 %arg0, ptr addrspace(1) undef
   store <2 x float> %arg1, ptr addrspace(1) undef
@@ -1780,9 +1735,9 @@ define void @void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg
 define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inreg %arg1) #0 {
 ; GFX9-LABEL: caller_void_func_i32_v2float_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s7, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
@@ -1812,9 +1767,9 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr
 ;
 ; GFX11-LABEL: caller_void_func_i32_v2float_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s4
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 01dcc26566663be..db314ed2d68ad41 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -7,21 +7,20 @@
 define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i1 %arg0, ptr addrspace(1) undef
   ret void
@@ -30,21 +29,20 @@ define void @void_func_i1(i1 %arg0) #0 {
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1_zeroext:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    v_or_b32_e32 v0, 12, v0
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i1_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i1 %arg0 to i32
   %add = add i32 %ext, 12
@@ -55,41 +53,38 @@ define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
 ; CI-LABEL: void_func_i1_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_i1_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_i1_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i1_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i1 %arg0 to i32
   %add = add i32 %ext, 12
@@ -100,10 +95,10 @@ define void @void_func_i1_signext(i1 signext %arg0) #0 {
 define void @i1_arg_i1_use(i1 %arg) #0 {
 ; CIGFX89-LABEL: i1_arg_i1_use:
 ; CIGFX89:       ; %bb.0: ; %bb
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CIGFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CIGFX89-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT:    s_waitcnt expcnt(0)
 ; CIGFX89-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
 ; CIGFX89-NEXT:    s_cbranch_execz .LBB3_2
 ; CIGFX89-NEXT:  ; %bb.1: ; %bb1
@@ -114,17 +109,17 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:  .LBB3_2: ; %bb2
 ; CIGFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_arg_i1_use:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
 ; GFX11-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb1
@@ -134,6 +129,7 @@ define void @i1_arg_i1_use(i1 %arg) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:  .LBB3_2: ; %bb2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   br i1 %arg, label %bb2, label %bb1
@@ -149,19 +145,18 @@ bb2:
 define void @void_func_i8(i8 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i8:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i8 %arg0, ptr addrspace(1) undef
   ret void
@@ -170,41 +165,38 @@ define void @void_func_i8(i8 %arg0) #0 {
 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
 ; CI-LABEL: void_func_i8_zeroext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_i8_zeroext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_i8_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i8_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i8 %arg0 to i32
   %add = add i32 %ext, 12
@@ -215,41 +207,38 @@ define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
 define void @void_func_i8_signext(i8 signext %arg0) #0 {
 ; CI-LABEL: void_func_i8_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_i8_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_i8_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i8_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i8 %arg0 to i32
   %add = add i32 %ext, 12
@@ -260,19 +249,18 @@ define void @void_func_i8_signext(i8 signext %arg0) #0 {
 define void @void_func_i16(i16 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i16:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i16 %arg0, ptr addrspace(1) undef
   ret void
@@ -281,41 +269,38 @@ define void @void_func_i16(i16 %arg0) #0 {
 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
 ; CI-LABEL: void_func_i16_zeroext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_i16_zeroext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i16_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i16 %arg0 to i32
   %add = add i32 %ext, 12
@@ -326,41 +311,38 @@ define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
 define void @void_func_i16_signext(i16 signext %arg0) #0 {
 ; CI-LABEL: void_func_i16_signext:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_i16_signext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_i16_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i16_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i16 %arg0 to i32
   %add = add i32 %ext, 12
@@ -371,19 +353,18 @@ define void @void_func_i16_signext(i16 signext %arg0) #0 {
 define void @void_func_i32(i32 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i32 %arg0, ptr addrspace(1) undef
   ret void
@@ -392,19 +373,18 @@ define void @void_func_i32(i32 %arg0) #0 {
 define void @void_func_i64(i64 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i64 %arg0, ptr addrspace(1) undef
   ret void
@@ -413,29 +393,27 @@ define void @void_func_i64(i64 %arg0) #0 {
 define void @void_func_f16(half %arg0) #0 {
 ; CI-LABEL: void_func_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store half %arg0, ptr addrspace(1) undef
   ret void
@@ -444,19 +422,18 @@ define void @void_func_f16(half %arg0) #0 {
 define void @void_func_f32(float %arg0) #0 {
 ; CIGFX89-LABEL: void_func_f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store float %arg0, ptr addrspace(1) undef
   ret void
@@ -465,19 +442,18 @@ define void @void_func_f32(float %arg0) #0 {
 define void @void_func_f64(double %arg0) #0 {
 ; CIGFX89-LABEL: void_func_f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store double %arg0, ptr addrspace(1) undef
   ret void
@@ -486,19 +462,18 @@ define void @void_func_f64(double %arg0) #0 {
 define void @void_func_v2i32(<2 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v2i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -507,19 +482,18 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 {
 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v3i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -528,19 +502,18 @@ define void @void_func_v3i32(<3 x i32> %arg0) #0 {
 define void @void_func_v4i32(<4 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v4i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -549,22 +522,21 @@ define void @void_func_v4i32(<4 x i32> %arg0) #0 {
 define void @void_func_v5i32(<5 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v5i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v4, off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -573,22 +545,21 @@ define void @void_func_v5i32(<5 x i32> %arg0) #0 {
 define void @void_func_v8i32(<8 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v8i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -597,19 +568,17 @@ define void @void_func_v8i32(<8 x i32> %arg0) #0 {
 define void @void_func_v16i32(<16 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v16i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x3
@@ -617,6 +586,7 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -625,7 +595,6 @@ define void @void_func_v16i32(<16 x i32> %arg0) #0 {
 define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v32i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -638,12 +607,11 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -658,6 +626,7 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <32 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -667,7 +636,6 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 {
 define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; CI-LABEL: void_func_v33i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -683,12 +651,11 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(5)
 ; CI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v33i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -704,12 +671,11 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(5)
 ; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v33i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
@@ -727,12 +693,11 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v33i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -751,6 +716,7 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <33 x i32> %arg0, ptr addrspace(1) undef
   ret void
@@ -759,19 +725,18 @@ define void @void_func_v33i32(<33 x i32> %arg0) #0 {
 define void @void_func_v2i64(<2 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v2i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -780,22 +745,21 @@ define void @void_func_v2i64(<2 x i64> %arg0) #0 {
 define void @void_func_v3i64(<3 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v3i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -804,22 +768,21 @@ define void @void_func_v3i64(<3 x i64> %arg0) #0 {
 define void @void_func_v4i64(<4 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v4i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -828,24 +791,23 @@ define void @void_func_v4i64(<4 x i64> %arg0) #0 {
 define void @void_func_v5i64(<5 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v5i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -854,19 +816,17 @@ define void @void_func_v5i64(<5 x i64> %arg0) #0 {
 define void @void_func_v8i64(<8 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v8i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x3
@@ -874,6 +834,7 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -882,7 +843,6 @@ define void @void_func_v8i64(<8 x i64> %arg0) #0 {
 define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v16i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -895,12 +855,11 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -915,6 +874,7 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i64> %arg0, ptr addrspace(1) undef
   ret void
@@ -923,7 +883,6 @@ define void @void_func_v16i64(<16 x i64> %arg0) #0 {
 define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v2i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; CI-NEXT:    s_mov_b32 s4, 0
@@ -932,12 +891,11 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s5, s4
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v2i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX89-NEXT:    s_mov_b32 s4, 0
 ; GFX89-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -945,12 +903,11 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_mov_b32 s5, s4
 ; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
@@ -959,6 +916,7 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -967,31 +925,29 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 define void @void_func_v2i16(<2 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v2i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v2i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1000,7 +956,6 @@ define void @void_func_v2i16(<2 x i16> %arg0) #0 {
 define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v3i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; CI-NEXT:    s_mov_b32 s5, 0
@@ -1011,12 +966,11 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 ; CI-NEXT:    buffer_store_byte v2, off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s4, s5
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v3i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX89-NEXT:    s_mov_b32 s5, 0
 ; GFX89-NEXT:    s_mov_b32 s4, 2
@@ -1026,12 +980,11 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 ; GFX89-NEXT:    buffer_store_byte v2, off, s[4:7], 0
 ; GFX89-NEXT:    s_mov_b32 s4, s5
 ; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_mov_b32 s1, 0
@@ -1042,6 +995,7 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b8 v2, off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s0, s1
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1050,7 +1004,6 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v4i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -1065,12 +1018,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s5, s4
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v4i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX89-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
@@ -1081,12 +1033,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_mov_b32 s5, s4
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
@@ -1103,6 +1054,7 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1111,7 +1063,6 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v5i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -1128,12 +1079,11 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 ; CI-NEXT:    buffer_store_byte v4, off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s4, s5
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v5i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX89-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
@@ -1146,12 +1096,11 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 ; GFX89-NEXT:    buffer_store_byte v4, off, s[4:7], 0
 ; GFX89-NEXT:    s_mov_b32 s4, s5
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
@@ -1169,6 +1118,7 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_mov_b32 s0, s1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1177,7 +1127,6 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v8i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; CI-NEXT:    v_and_b32_e32 v4, 0xff, v4
@@ -1201,12 +1150,11 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s5, s4
 ; CI-NEXT:    buffer_store_dwordx2 v[3:4], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v8i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX89-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1222,12 +1170,11 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_mov_b32 s5, s4
 ; GFX89-NEXT:    buffer_store_dwordx2 v[3:4], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
@@ -1252,6 +1199,7 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1260,7 +1208,6 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v16i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v14, 0xff, v14
 ; CI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
 ; CI-NEXT:    v_and_b32_e32 v12, 0xff, v12
@@ -1302,12 +1249,11 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s5, s4
 ; CI-NEXT:    buffer_store_dwordx4 v[9:12], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v16i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
@@ -1333,12 +1279,11 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_mov_b32 s5, s4
 ; GFX89-NEXT:    buffer_store_dwordx4 v[9:12], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
 ; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
@@ -1380,6 +1325,7 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1388,7 +1334,6 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; CI-LABEL: void_func_v32i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -1470,12 +1415,11 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s4, s5
 ; CI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v32i8:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
 ; GFX89-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
@@ -1525,12 +1469,11 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX89-NEXT:    s_mov_b32 s4, s5
 ; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_u8 v31, off, s32
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
@@ -1614,6 +1557,7 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    s_mov_b32 s0, s1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <32 x i8> %arg0, ptr addrspace(1) null
   ret void
@@ -1622,7 +1566,6 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v3i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1630,27 +1573,26 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    buffer_store_short v2, off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v3i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_short v1, off, s[4:7], 0
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1659,7 +1601,6 @@ define void @void_func_v3i16(<3 x i16> %arg0) #0 {
 define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v4i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1669,24 +1610,23 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v4i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1695,7 +1635,6 @@ define void @void_func_v4i16(<4 x i16> %arg0) #0 {
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v5i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -1706,27 +1645,26 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
 ; CI-NEXT:    v_or_b32_e32 v1, v0, v1
 ; CI-NEXT:    buffer_store_short v4, off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v5i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_short v2, off, s[4:7], 0
 ; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v5i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v2, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1735,7 +1673,6 @@ define void @void_func_v5i16(<5 x i16> %arg0) #0 {
 define void @void_func_v8i16(<8 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v8i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; CI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
@@ -1751,24 +1688,23 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v8i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1777,7 +1713,6 @@ define void @void_func_v8i16(<8 x i16> %arg0) #0 {
 define void @void_func_v16i16(<16 x i16> %arg0) #0 {
 ; CI-LABEL: void_func_v16i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -1806,27 +1741,26 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
 ; CI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; CI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v16i16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i16> %arg0, ptr addrspace(1) undef
   ret void
@@ -1835,43 +1769,39 @@ define void @void_func_v16i16(<16 x i16> %arg0) #0 {
 define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 ; CI-LABEL: void_func_v2i24:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v2i24:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2i24:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1880,6 +1810,7 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %elt0 = extractelement <2 x i24> %arg0, i32 0
   %elt1 = extractelement <2 x i24> %arg0, i32 1
@@ -1891,19 +1822,18 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 define void @void_func_v2f32(<2 x float> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v2f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1912,19 +1842,18 @@ define void @void_func_v2f32(<2 x float> %arg0) #0 {
 define void @void_func_v3f32(<3 x float> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v3f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1933,19 +1862,18 @@ define void @void_func_v3f32(<3 x float> %arg0) #0 {
 define void @void_func_v4f32(<4 x float> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v4f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1954,22 +1882,21 @@ define void @void_func_v4f32(<4 x float> %arg0) #0 {
 define void @void_func_v8f32(<8 x float> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v8f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -1978,19 +1905,17 @@ define void @void_func_v8f32(<8 x float> %arg0) #0 {
 define void @void_func_v16f32(<16 x float> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v16f32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x3
@@ -1998,6 +1923,7 @@ define void @void_func_v16f32(<16 x float> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x float> %arg0, ptr addrspace(1) undef
   ret void
@@ -2006,19 +1932,18 @@ define void @void_func_v16f32(<16 x float> %arg0) #0 {
 define void @void_func_v2f64(<2 x double> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v2f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -2027,22 +1952,21 @@ define void @void_func_v2f64(<2 x double> %arg0) #0 {
 define void @void_func_v3f64(<3 x double> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v3f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -2051,22 +1975,21 @@ define void @void_func_v3f64(<3 x double> %arg0) #0 {
 define void @void_func_v4f64(<4 x double> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v4f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -2075,19 +1998,17 @@ define void @void_func_v4f64(<4 x double> %arg0) #0 {
 define void @void_func_v8f64(<8 x double> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v8f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x3
@@ -2095,6 +2016,7 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -2103,7 +2025,6 @@ define void @void_func_v8f64(<8 x double> %arg0) #0 {
 define void @void_func_v16f64(<16 x double> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_v16f64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
@@ -2116,12 +2037,11 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -2136,6 +2056,7 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
 ; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x double> %arg0, ptr addrspace(1) undef
   ret void
@@ -2144,7 +2065,6 @@ define void @void_func_v16f64(<16 x double> %arg0) #0 {
 define void @void_func_v2f16(<2 x half> %arg0) #0 {
 ; CI-LABEL: void_func_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2152,24 +2072,23 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 {
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v2f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -2179,7 +2098,6 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 {
 define void @void_func_v3f16(<3 x half> %arg0) #0 {
 ; CI-LABEL: void_func_v3f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2189,27 +2107,26 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 {
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CI-NEXT:    buffer_store_short v2, off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v3f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_short v1, off, s[4:7], 0
 ; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -2218,7 +2135,6 @@ define void @void_func_v3f16(<3 x half> %arg0) #0 {
 define void @void_func_v4f16(<4 x half> %arg0) #0 {
 ; CI-LABEL: void_func_v4f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v1
@@ -2230,24 +2146,23 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v4f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -2256,7 +2171,6 @@ define void @void_func_v4f16(<4 x half> %arg0) #0 {
 define void @void_func_v8f16(<8 x half> %arg0) #0 {
 ; CI-LABEL: void_func_v8f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v5
@@ -2276,24 +2190,23 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v8f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v8f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -2302,7 +2215,6 @@ define void @void_func_v8f16(<8 x half> %arg0) #0 {
 define void @void_func_v16f16(<16 x half> %arg0) #0 {
 ; CI-LABEL: void_func_v16f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -2339,27 +2251,26 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[4:7], 0
 ; CI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: void_func_v16f16:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v16f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x half> %arg0, ptr addrspace(1) undef
   ret void
@@ -2369,7 +2280,6 @@ define void @void_func_v16f16(<16 x half> %arg0) #0 {
 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 ; CIGFX89-LABEL: void_func_i32_i64_i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -2377,12 +2287,11 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    buffer_store_dword v3, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_i32_i64_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 dlc
@@ -2391,6 +2300,7 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile i32 %arg0, ptr addrspace(1) undef
   store volatile i64 %arg1, ptr addrspace(1) undef
@@ -2401,19 +2311,18 @@ define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
 define void @void_func_struct_i32({ i32 } %arg0) #0 {
 ; CIGFX89-LABEL: void_func_struct_i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_struct_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store { i32 } %arg0, ptr addrspace(1) undef
   ret void
@@ -2422,22 +2331,21 @@ define void @void_func_struct_i32({ i32 } %arg0) #0 {
 define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
 ; CIGFX89-LABEL: void_func_struct_i8_i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store { i8, i32 } %arg0, ptr addrspace(1) undef
   ret void
@@ -2446,7 +2354,6 @@ define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
 define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
 ; CIGFX89-LABEL: void_func_byval_struct_i8_i32:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; CIGFX89-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
@@ -2455,12 +2362,11 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(1)
 ; CIGFX89-NEXT:    buffer_store_byte v1, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_byval_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_u8 v1, off, s32
@@ -2470,6 +2376,7 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
   store { i8, i32 } %arg0.load, ptr addrspace(1) undef
@@ -2479,7 +2386,6 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }) %arg0, ptr addrspace(5) byval({ i8, i32 }) %arg1, i32 %arg2) #0 {
 ; CI-LABEL: void_func_byval_struct_i8_i32_x2:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
@@ -2500,12 +2406,11 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; CI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    ds_write_b32 v0, v0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_byval_struct_i8_i32_x2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
@@ -2526,12 +2431,11 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    ds_write_b32 v0, v0
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_byval_struct_i8_i32_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
@@ -2551,12 +2455,11 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; GFX9-NEXT:    buffer_store_byte v3, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_byval_struct_i8_i32_x2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
@@ -2576,7 +2479,7 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; GFX11-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    ds_store_b32 v0, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
   %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
@@ -2589,7 +2492,6 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, ptr addrspace(5) byval(i64) %arg1) #0 {
 ; CIGFX89-LABEL: void_func_byval_i32_byval_i64:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    buffer_load_dword v2, off, s[0:3], s32
 ; CIGFX89-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
 ; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12
@@ -2599,12 +2501,11 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt
 ; CIGFX89-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(1)
 ; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_byval_i32_byval_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b32 v2, off, s32
 ; GFX11-NEXT:    scratch_load_b64 v[0:1], off, s32 offset:8
@@ -2614,6 +2515,7 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt
 ; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load i32, ptr addrspace(5) %arg0
   %arg1.load = load i64, ptr addrspace(5) %arg1
@@ -2625,7 +2527,6 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt
 define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_i32_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -2652,12 +2553,11 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; CI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_i32_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -2684,12 +2584,11 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_i32_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -2717,12 +2616,11 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i32_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:12
@@ -2753,6 +2651,7 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i32 %arg1, ptr addrspace(1) undef
@@ -2764,7 +2663,6 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
 ; CI-LABEL: void_func_v32i32_i1_i8_i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -2798,12 +2696,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_i1_i8_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -2836,12 +2733,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_i1_i8_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -2876,12 +2772,11 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_i1_i8_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
@@ -2920,6 +2815,7 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
@@ -2932,7 +2828,6 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
 define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v2i32_v2f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -2960,12 +2855,11 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v2i32_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -2993,12 +2887,11 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v2i32_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3027,12 +2920,11 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i32_v2f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
@@ -3064,6 +2956,7 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b64 v[34:35], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i32> %arg1, ptr addrspace(1) undef
@@ -3074,7 +2967,6 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
 define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v2i16_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -3109,12 +3001,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v2i16_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -3140,12 +3031,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v2i16_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3173,12 +3063,11 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i16_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
@@ -3208,6 +3097,7 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i16> %arg1, ptr addrspace(1) undef
@@ -3218,7 +3108,6 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2
 define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v2i64_v2f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -3250,12 +3139,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v2i64_v2f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -3287,12 +3175,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v2i64_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3326,12 +3213,11 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v2i64_v2f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
@@ -3367,6 +3253,7 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i64> %arg1, ptr addrspace(1) undef
@@ -3377,7 +3264,6 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2
 define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v4i32_v4f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -3409,12 +3295,11 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v4i32_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -3446,12 +3331,11 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v4i32_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3485,12 +3369,11 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v4i32_v4f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:16
@@ -3526,6 +3409,7 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <4 x i32> %arg1, ptr addrspace(1) undef
@@ -3536,7 +3420,6 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4
 define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v8i32_v8f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -3580,12 +3463,11 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v8i32_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -3629,12 +3511,11 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v8i32_v8f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3682,12 +3563,11 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v8i32_v8f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x10
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:44
@@ -3737,6 +3617,7 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <8 x i32> %arg1, ptr addrspace(1) undef
@@ -3747,7 +3628,6 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8
 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
 ; CI-LABEL: void_func_v32i32_v16i32_v16f32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -3815,12 +3695,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v16i32_v16f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -3888,12 +3767,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v16i32_v16f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -3969,12 +3847,11 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x20
 ; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:80
 ; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:76
@@ -4052,6 +3929,7 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <16 x i32> %arg1, ptr addrspace(1) undef
@@ -4063,44 +3941,40 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
 ; CI-LABEL: void_func_v3f32_wasted_reg:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_write_b32 v0, v0
 ; CI-NEXT:    ds_write_b32 v0, v1
 ; CI-NEXT:    ds_write_b32 v0, v2
 ; CI-NEXT:    ds_write_b32 v0, v3
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v3f32_wasted_reg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_write_b32 v0, v0
 ; VI-NEXT:    ds_write_b32 v0, v1
 ; VI-NEXT:    ds_write_b32 v0, v2
 ; VI-NEXT:    ds_write_b32 v0, v3
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v3f32_wasted_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v0
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2
 ; GFX9-NEXT:    ds_write_b32 v0, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3f32_wasted_reg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_store_b32 v0, v0
 ; GFX11-NEXT:    ds_store_b32 v0, v1
 ; GFX11-NEXT:    ds_store_b32 v0, v2
 ; GFX11-NEXT:    ds_store_b32 v0, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.0 = extractelement <3 x float> %arg0, i32 0
   %arg0.1 = extractelement <3 x float> %arg0, i32 1
@@ -4115,44 +3989,40 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
 ; CI-LABEL: void_func_v3i32_wasted_reg:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_write_b32 v0, v0
 ; CI-NEXT:    ds_write_b32 v0, v1
 ; CI-NEXT:    ds_write_b32 v0, v2
 ; CI-NEXT:    ds_write_b32 v0, v3
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v3i32_wasted_reg:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_write_b32 v0, v0
 ; VI-NEXT:    ds_write_b32 v0, v1
 ; VI-NEXT:    ds_write_b32 v0, v2
 ; VI-NEXT:    ds_write_b32 v0, v3
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v3i32_wasted_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b32 v0, v0
 ; GFX9-NEXT:    ds_write_b32 v0, v1
 ; GFX9-NEXT:    ds_write_b32 v0, v2
 ; GFX9-NEXT:    ds_write_b32 v0, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v3i32_wasted_reg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_store_b32 v0, v0
 ; GFX11-NEXT:    ds_store_b32 v0, v1
 ; GFX11-NEXT:    ds_store_b32 v0, v2
 ; GFX11-NEXT:    ds_store_b32 v0, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.0 = extractelement <3 x i32> %arg0, i32 0
   %arg0.1 = extractelement <3 x i32> %arg0, i32 1
@@ -4168,7 +4038,6 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
 define void @void_func_volatile_v16i8(<16 x i8> %arg0) #0 {
 ; CIGFX89-LABEL: void_func_volatile_v16i8:
 ; CIGFX89:       ; %bb.0:
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; CIGFX89-NEXT:    s_mov_b32 s6, -1
 ; CIGFX89-NEXT:    buffer_store_byte v15, off, s[4:7], 0
@@ -4202,12 +4071,11 @@ define void @void_func_volatile_v16i8(<16 x i8> %arg0) #0 {
 ; CIGFX89-NEXT:    buffer_store_byte v1, off, s[4:7], 0
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
-; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_volatile_v16i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v15, off, s[0:3], 0 dlc
@@ -4242,6 +4110,7 @@ define void @void_func_volatile_v16i8(<16 x i8> %arg0) #0 {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <16 x i8> %arg0, ptr addrspace(1) undef
   ret void
@@ -4251,7 +4120,6 @@ define void @void_func_volatile_v16i8(<16 x i8> %arg0) #0 {
 define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; CI-LABEL: void_func_v32i32_v16i8:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -4319,12 +4187,11 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; CI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: void_func_v32i32_v16i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
@@ -4392,12 +4259,11 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; VI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_v32i32_v16i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -4469,12 +4335,11 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX9-NEXT:    buffer_store_byte v5, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_byte v6, off, s[4:7], 0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_v32i32_v16i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x10
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:64
@@ -4560,6 +4425,7 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <16 x i8> %arg1, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 9df4b903de4bda0..0e12d7e04bf2bd6 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -7,20 +7,18 @@
 define i1 @i1_func_void() #0 {
 ; GFX789-LABEL: i1_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -30,20 +28,18 @@ define i1 @i1_func_void() #0 {
 define zeroext i1 @i1_zeroext_func_void() #0 {
 ; GFX789-LABEL: i1_zeroext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_zeroext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -52,22 +48,22 @@ define zeroext i1 @i1_zeroext_func_void() #0 {
 define signext i1 @i1_signext_func_void() #0 {
 ; GFX789-LABEL: i1_signext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX789-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i1_signext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
@@ -76,20 +72,18 @@ define signext i1 @i1_signext_func_void() #0 {
 define i8 @i8_func_void() #0 {
 ; GFX789-LABEL: i8_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i8_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
@@ -98,20 +92,18 @@ define i8 @i8_func_void() #0 {
 define zeroext i8 @i8_zeroext_func_void() #0 {
 ; GFX789-LABEL: i8_zeroext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i8_zeroext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
@@ -120,20 +112,18 @@ define zeroext i8 @i8_zeroext_func_void() #0 {
 define signext i8 @i8_signext_func_void() #0 {
 ; GFX789-LABEL: i8_signext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i8_signext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
@@ -142,20 +132,18 @@ define signext i8 @i8_signext_func_void() #0 {
 define i16 @i16_func_void() #0 {
 ; GFX789-LABEL: i16_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
@@ -164,20 +152,18 @@ define i16 @i16_func_void() #0 {
 define zeroext i16 @i16_zeroext_func_void() #0 {
 ; GFX789-LABEL: i16_zeroext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_zeroext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
@@ -186,20 +172,18 @@ define zeroext i16 @i16_zeroext_func_void() #0 {
 define signext i16 @i16_signext_func_void() #0 {
 ; GFX789-LABEL: i16_signext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_sshort v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_signext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_i16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
@@ -208,20 +192,18 @@ define signext i16 @i16_signext_func_void() #0 {
 define i32 @i32_func_void() #0 {
 ; GFX789-LABEL: i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i32, ptr addrspace(1) undef
   ret i32 %val
@@ -230,23 +212,21 @@ define i32 @i32_func_void() #0 {
 define i48 @i48_func_void() #0 {
 ; GFX789-LABEL: i48_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i48_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
@@ -255,23 +235,21 @@ define i48 @i48_func_void() #0 {
 define zeroext i48 @i48_zeroext_func_void() #0 {
 ; GFX789-LABEL: i48_zeroext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i48_zeroext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
@@ -280,23 +258,21 @@ define zeroext i48 @i48_zeroext_func_void() #0 {
 define signext i48 @i48_signext_func_void() #0 {
 ; GFX789-LABEL: i48_signext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_sshort v1, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i48_signext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_i16 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
@@ -318,14 +294,14 @@ define i63 @i63_func_void(i63 %val) #0 {
 define zeroext i63 @i63_zeroext_func_void(i63 %val) #0 {
 ; GFX789-LABEL: i63_zeroext_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i63_zeroext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret i63 %val
 }
@@ -333,24 +309,24 @@ define zeroext i63 @i63_zeroext_func_void(i63 %val) #0 {
 define signext i63 @i63_signext_func_void(i63 %val) #0 {
 ; CI-LABEL: i63_signext_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; CI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: i63_signext_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX89-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i63_signext_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret i63 %val
 }
@@ -358,20 +334,18 @@ define signext i63 @i63_signext_func_void(i63 %val) #0 {
 define i64 @i64_func_void() #0 {
 ; GFX789-LABEL: i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i64, ptr addrspace(1) undef
   ret i64 %val
@@ -380,23 +354,21 @@ define i64 @i64_func_void() #0 {
 define i65 @i65_func_void() #0 {
 ; GFX789-LABEL: i65_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i65_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_u8 v2, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i65, ptr addrspace(1) undef
   ret i65 %val
@@ -405,20 +377,18 @@ define i65 @i65_func_void() #0 {
 define float @f32_func_void() #0 {
 ; GFX789-LABEL: f32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: f32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load float, ptr addrspace(1) undef
   ret float %val
@@ -427,20 +397,18 @@ define float @f32_func_void() #0 {
 define double @f64_func_void() #0 {
 ; GFX789-LABEL: f64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: f64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load double, ptr addrspace(1) undef
   ret double %val
@@ -449,20 +417,18 @@ define double @f64_func_void() #0 {
 define <2 x double> @v2f64_func_void() #0 {
 ; GFX789-LABEL: v2f64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2f64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x double>, ptr addrspace(1) undef
   ret <2 x double> %val
@@ -471,20 +437,18 @@ define <2 x double> @v2f64_func_void() #0 {
 define <2 x i32> @v2i32_func_void() #0 {
 ; GFX789-LABEL: v2i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i32>, ptr addrspace(1) undef
   ret <2 x i32> %val
@@ -493,20 +457,18 @@ define <2 x i32> @v2i32_func_void() #0 {
 define <3 x i32> @v3i32_func_void() #0 {
 ; GFX789-LABEL: v3i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b96 v[0:2], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i32>, ptr addrspace(1) undef
   ret <3 x i32> %val
@@ -515,20 +477,18 @@ define <3 x i32> @v3i32_func_void() #0 {
 define <4 x i32> @v4i32_func_void() #0 {
 ; GFX789-LABEL: v4i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v4i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i32>, ptr addrspace(1) undef
   ret <4 x i32> %val
@@ -537,24 +497,22 @@ define <4 x i32> @v4i32_func_void() #0 {
 define <5 x i32> @v5i32_func_void() #0 {
 ; GFX789-LABEL: v5i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dword v4, off, s[4:7], 0 glc
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v5i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b32 v4, off, s[0:3], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile <5 x i32>, ptr addrspace(1) undef
   ret <5 x i32> %val
@@ -563,19 +521,17 @@ define <5 x i32> @v5i32_func_void() #0 {
 define <8 x i32> @v8i32_func_void() #0 {
 ; GFX789-LABEL: v8i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v8i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -583,7 +539,7 @@ define <8 x i32> @v8i32_func_void() #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i32>, ptr addrspace(1) %ptr
@@ -593,7 +549,6 @@ define <8 x i32> @v8i32_func_void() #0 {
 define <16 x i32> @v16i32_func_void() #0 {
 ; GFX789-LABEL: v16i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -602,12 +557,11 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v16i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -617,7 +571,7 @@ define <16 x i32> @v16i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i32>, ptr addrspace(1) %ptr
@@ -627,7 +581,6 @@ define <16 x i32> @v16i32_func_void() #0 {
 define <32 x i32> @v32i32_func_void() #0 {
 ; GFX789-LABEL: v32i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -640,12 +593,11 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; GFX789-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
 ; GFX789-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v32i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -659,7 +611,7 @@ define <32 x i32> @v32i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
 ; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <32 x i32>, ptr addrspace(1) %ptr
@@ -669,20 +621,18 @@ define <32 x i32> @v32i32_func_void() #0 {
 define <2 x i64> @v2i64_func_void() #0 {
 ; GFX789-LABEL: v2i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i64>, ptr addrspace(1) undef
   ret <2 x i64> %val
@@ -691,19 +641,17 @@ define <2 x i64> @v2i64_func_void() #0 {
 define <3 x i64> @v3i64_func_void() #0 {
 ; GFX789-LABEL: v3i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -711,7 +659,7 @@ define <3 x i64> @v3i64_func_void() #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <3 x i64>, ptr addrspace(1) %ptr
@@ -721,19 +669,17 @@ define <3 x i64> @v3i64_func_void() #0 {
 define <4 x i64> @v4i64_func_void() #0 {
 ; GFX789-LABEL: v4i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v4i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -741,7 +687,7 @@ define <4 x i64> @v4i64_func_void() #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <4 x i64>, ptr addrspace(1) %ptr
@@ -751,7 +697,6 @@ define <4 x i64> @v4i64_func_void() #0 {
 define <5 x i64> @v5i64_func_void() #0 {
 ; GFX789-LABEL: v5i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -759,12 +704,11 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GFX789-NEXT:    buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v5i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -773,7 +717,7 @@ define <5 x i64> @v5i64_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-NEXT:    buffer_load_b64 v[8:9], off, s[0:3], 0 offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <5 x i64>, ptr addrspace(1) %ptr
@@ -783,7 +727,6 @@ define <5 x i64> @v5i64_func_void() #0 {
 define <8 x i64> @v8i64_func_void() #0 {
 ; GFX789-LABEL: v8i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -792,12 +735,11 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
 ; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
 ; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v8i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -807,7 +749,7 @@ define <8 x i64> @v8i64_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
 ; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i64>, ptr addrspace(1) %ptr
@@ -817,7 +759,6 @@ define <8 x i64> @v8i64_func_void() #0 {
 define <16 x i64> @v16i64_func_void() #0 {
 ; GFX789-LABEL: v16i64_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -830,12 +771,11 @@ define <16 x i64> @v16i64_func_void() #0 {
 ; GFX789-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
 ; GFX789-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
 ; GFX789-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v16i64_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -849,7 +789,7 @@ define <16 x i64> @v16i64_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
 ; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i64>, ptr addrspace(1) %ptr
@@ -859,30 +799,28 @@ define <16 x i64> @v16i64_func_void() #0 {
 define <2 x i16> @v2i16_func_void() #0 {
 ; CI-LABEL: v2i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v2i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i16>, ptr addrspace(1) undef
   ret <2 x i16> %val
@@ -891,7 +829,6 @@ define <2 x i16> @v2i16_func_void() #0 {
 define <3 x i16> @v3i16_func_void() #0 {
 ; CI-LABEL: v3i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
@@ -899,24 +836,23 @@ define <3 x i16> @v3i16_func_void() #0 {
 ; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; CI-NEXT:    v_mov_b32_e32 v0, v2
 ; CI-NEXT:    v_mov_b32_e32 v2, v3
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v3i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i16>, ptr addrspace(1) undef
   ret <3 x i16> %val
@@ -925,7 +861,6 @@ define <3 x i16> @v3i16_func_void() #0 {
 define <4 x i16> @v4i16_func_void() #0 {
 ; CI-LABEL: v4i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
@@ -934,24 +869,23 @@ define <4 x i16> @v4i16_func_void() #0 {
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; CI-NEXT:    v_mov_b32_e32 v2, v1
 ; CI-NEXT:    v_mov_b32_e32 v1, v4
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v4i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v4i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i16>, ptr addrspace(1) undef
   ret <4 x i16> %val
@@ -960,7 +894,6 @@ define <4 x i16> @v4i16_func_void() #0 {
 define <4 x half> @v4f16_func_void() #0 {
 ; CI-LABEL: v4f16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
@@ -971,24 +904,23 @@ define <4 x half> @v4f16_func_void() #0 {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v4f16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v4f16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x half>, ptr addrspace(1) undef
   ret <4 x half> %val
@@ -999,7 +931,6 @@ define <4 x half> @v4f16_func_void() #0 {
 define <5 x i16> @v5i16_func_void() #0 {
 ; CI-LABEL: v5i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1011,29 +942,27 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; CI-NEXT:    v_mov_b32_e32 v2, v1
 ; CI-NEXT:    v_mov_b32_e32 v1, v5
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v5i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v5i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <5 x i16>, ptr addrspace(1) %ptr
@@ -1043,7 +972,6 @@ define <5 x i16> @v5i16_func_void() #0 {
 define <8 x i16> @v8i16_func_void() #0 {
 ; CI-LABEL: v8i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1058,28 +986,27 @@ define <8 x i16> @v8i16_func_void() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v2, v9
 ; CI-NEXT:    v_mov_b32_e32 v4, v10
 ; CI-NEXT:    v_mov_b32_e32 v6, v11
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v8i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v8i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i16>, ptr addrspace(1) %ptr
@@ -1089,7 +1016,6 @@ define <8 x i16> @v8i16_func_void() #0 {
 define <16 x i16> @v16i16_func_void() #0 {
 ; CI-LABEL: v16i16_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1114,23 +1040,22 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; CI-NEXT:    v_mov_b32_e32 v10, v19
 ; CI-NEXT:    v_mov_b32_e32 v12, v20
 ; CI-NEXT:    v_mov_b32_e32 v14, v21
+; CI-NEXT:    s_waitcnt expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v16i16_func_void:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX89-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX89-NEXT:    s_mov_b32 s6, -1
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX89-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v16i16_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1138,7 +1063,7 @@ define <16 x i16> @v16i16_func_void() #0 {
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i16>, ptr addrspace(1) %ptr
@@ -1149,7 +1074,6 @@ define <16 x i16> @v16i16_func_void() #0 {
 define <16 x i8> @v16i8_func_void() #0 {
 ; GFX789-LABEL: v16i8_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -1174,11 +1098,11 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX789-NEXT:    v_mov_b32_e32 v1, v16
 ; GFX789-NEXT:    v_mov_b32_e32 v2, v17
 ; GFX789-NEXT:    v_mov_b32_e32 v3, v18
+; GFX789-NEXT:    s_waitcnt expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v16i8_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1201,6 +1125,7 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v8, v2
 ; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v17
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1211,7 +1136,6 @@ define <16 x i8> @v16i8_func_void() #0 {
 define <4  x i8> @v4i8_func_void() #0 {
 ; GFX789-LABEL: v4i8_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
@@ -1221,11 +1145,11 @@ define <4  x i8> @v4i8_func_void() #0 {
 ; GFX789-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX789-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX789-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX789-NEXT:    s_waitcnt expcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v4i8_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1235,6 +1159,7 @@ define <4  x i8> @v4i8_func_void() #0 {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <4  x i8>, ptr addrspace(1) %ptr
@@ -1244,23 +1169,21 @@ define <4  x i8> @v4i8_func_void() #0 {
 define {i8, i32} @struct_i8_i32_func_void() #0 {
 ; GFX789-LABEL: struct_i8_i32_func_void:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
 ; GFX789-NEXT:    buffer_load_dword v1, off, s[4:7], 0
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: struct_i8_i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load { i8, i32 }, ptr addrspace(1) undef
   ret { i8, i32 } %val
@@ -1269,7 +1192,6 @@ define {i8, i32} @struct_i8_i32_func_void() #0 {
 define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %arg0) #0 {
 ; GFX789-LABEL: void_func_sret_struct_i8_i32:
 ; GFX789:       ; %bb.0:
-; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX789-NEXT:    s_mov_b32 s6, -1
 ; GFX789-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 glc
@@ -1278,12 +1200,11 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
 ; GFX789-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_sret_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_load_u8 v1, off, s[0:3], 0 glc dlc
@@ -1293,6 +1214,7 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_store_b8 v0, v1, off
 ; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile i8, ptr addrspace(1) undef
   %val1 = load volatile i32, ptr addrspace(1) undef
@@ -1309,7 +1231,6 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
 define <33 x i32> @v33i32_func_void() #0 {
 ; CI-LABEL: v33i32_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1394,12 +1315,11 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
 ; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v33i32_func_void:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -1484,12 +1404,11 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 4, v0
 ; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v33i32_func_void:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1545,12 +1464,11 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v33i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1593,6 +1511,7 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1602,7 +1521,6 @@ define <33 x i32> @v33i32_func_void() #0 {
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; CI-LABEL: struct_v32i32_i32_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1687,12 +1605,11 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
 ; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: struct_v32i32_i32_func_void:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -1777,12 +1694,11 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 4, v0
 ; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: struct_v32i32_i32_func_void:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -1838,12 +1754,11 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: struct_v32i32_i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -1886,6 +1801,7 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -1895,7 +1811,6 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; CI-LABEL: struct_i32_v32i32_func_void:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
@@ -1980,12 +1895,11 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
 ; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: struct_i32_v32i32_func_void:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
@@ -2070,12 +1984,11 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
 ; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
 ; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: struct_i32_v32i32_func_void:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
@@ -2131,12 +2044,11 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
 ; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:132
 ; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:128
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: struct_i32_v32i32_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -2179,6 +2091,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s0
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
@@ -2189,48 +2102,44 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
 ; CI-LABEL: v3i32_struct_func_void_wasted_reg:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_read_b32 v0, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_read_b32 v1, v0
 ; CI-NEXT:    ds_read_b32 v2, v0
 ; CI-NEXT:    ds_read_b32 v3, v0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v3i32_struct_func_void_wasted_reg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_read_b32 v1, v0
 ; GFX8-NEXT:    ds_read_b32 v2, v0
 ; GFX8-NEXT:    ds_read_b32 v3, v0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v3i32_struct_func_void_wasted_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v1, v0
 ; GFX9-NEXT:    ds_read_b32 v2, v0
 ; GFX9-NEXT:    ds_read_b32 v3, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3i32_struct_func_void_wasted_reg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b32 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b32 v1, v0
 ; GFX11-NEXT:    ds_load_b32 v2, v0
 ; GFX11-NEXT:    ds_load_b32 v3, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load volatile i32, ptr addrspace(3) undef
   %load1 = load volatile i32, ptr addrspace(3) undef
@@ -2248,48 +2157,44 @@ define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
 define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
 ; CI-LABEL: v3f32_struct_func_void_wasted_reg:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_read_b32 v0, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_read_b32 v1, v0
 ; CI-NEXT:    ds_read_b32 v2, v0
 ; CI-NEXT:    ds_read_b32 v3, v0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v3f32_struct_func_void_wasted_reg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    ds_read_b32 v1, v0
 ; GFX8-NEXT:    ds_read_b32 v2, v0
 ; GFX8-NEXT:    ds_read_b32 v3, v0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v3f32_struct_func_void_wasted_reg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v1, v0
 ; GFX9-NEXT:    ds_read_b32 v2, v0
 ; GFX9-NEXT:    ds_read_b32 v3, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v3f32_struct_func_void_wasted_reg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b32 v0, v0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b32 v1, v0
 ; GFX11-NEXT:    ds_load_b32 v2, v0
 ; GFX11-NEXT:    ds_load_b32 v3, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load volatile float, ptr addrspace(3) undef
   %load1 = load volatile float, ptr addrspace(3) undef
@@ -2307,49 +2212,45 @@ define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
 define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) #0 {
 ; CI-LABEL: void_func_sret_max_known_zero_bits:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    ds_write_b32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    ds_write_b32 v0, v0
 ; CI-NEXT:    ds_write_b32 v0, v0
-; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: void_func_sret_max_known_zero_bits:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write_b32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    ds_write_b32 v0, v0
 ; GFX8-NEXT:    ds_write_b32 v0, v0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: void_func_sret_max_known_zero_bits:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    ds_write_b32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    ds_write_b32 v0, v0
 ; GFX9-NEXT:    ds_write_b32 v0, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_sret_max_known_zero_bits:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    ds_store_b32 v0, v1
 ; GFX11-NEXT:    ds_store_b32 v0, v0
 ; GFX11-NEXT:    ds_store_b32 v0, v2
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32
 
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
index 9ab8be0485eddbe..3a55676b8e42f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll
@@ -7,9 +7,9 @@ declare void @extern_c_func()
 define amdgpu_gfx void @gfx_func() {
 ; SDAG-LABEL: gfx_func:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s36, s33
 ; SDAG-NEXT:    s_mov_b32 s33, s32
+; SDAG-NEXT:    s_waitcnt expcnt(0)
 ; SDAG-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; SDAG-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; SDAG-NEXT:    s_mov_b64 exec, s[34:35]
@@ -84,9 +84,9 @@ define amdgpu_gfx void @gfx_func() {
 ;
 ; GISEL-LABEL: gfx_func:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s36, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[34:35]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f827a78125b7785..6f948202211dbb4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -109,9 +109,9 @@ declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_stru
 define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -137,9 +137,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -167,9 +167,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -196,9 +196,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -230,9 +230,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -260,9 +260,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_signext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -292,9 +292,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -322,9 +322,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -359,9 +359,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -389,9 +389,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -421,9 +421,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -451,9 +451,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -488,9 +488,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -515,9 +515,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -544,9 +544,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i8_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -572,9 +572,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -605,9 +605,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -633,9 +633,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -663,9 +663,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i8_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -692,9 +692,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -727,9 +727,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -755,9 +755,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -785,9 +785,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -814,9 +814,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -849,9 +849,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -876,9 +876,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -905,9 +905,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i16_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -933,9 +933,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -966,9 +966,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_signext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -994,9 +994,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1024,9 +1024,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i16_signext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1053,9 +1053,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1088,9 +1088,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1116,9 +1116,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1146,9 +1146,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1175,9 +1175,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1210,9 +1210,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1237,9 +1237,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1266,9 +1266,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1294,9 +1294,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1327,9 +1327,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i64_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1355,9 +1355,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1385,9 +1385,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i64_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1413,9 +1413,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1447,9 +1447,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1476,9 +1476,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1507,9 +1507,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1537,9 +1537,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1573,9 +1573,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1603,9 +1603,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1635,9 +1635,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1664,9 +1664,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1700,9 +1700,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1731,9 +1731,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1764,9 +1764,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1794,9 +1794,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1834,9 +1834,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1867,9 +1867,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1902,9 +1902,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1933,9 +1933,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1974,9 +1974,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f16_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2001,9 +2001,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f16_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2030,9 +2030,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f16_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2058,9 +2058,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2091,9 +2091,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2118,9 +2118,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2147,9 +2147,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2175,9 +2175,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2208,9 +2208,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2236,9 +2236,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2266,9 +2266,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2294,9 +2294,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2328,9 +2328,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2357,9 +2357,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2388,9 +2388,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2417,9 +2417,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2452,9 +2452,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2483,9 +2483,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2516,9 +2516,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2546,9 +2546,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2583,9 +2583,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f64_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2611,9 +2611,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2641,9 +2641,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f64_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2669,9 +2669,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2703,9 +2703,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2733,9 +2733,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2765,9 +2765,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2794,9 +2794,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2830,9 +2830,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2862,9 +2862,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2896,9 +2896,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -2926,9 +2926,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2964,9 +2964,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -2995,9 +2995,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3028,9 +3028,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3060,9 +3060,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3098,9 +3098,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -3130,9 +3130,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3164,9 +3164,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3196,9 +3196,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3235,9 +3235,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -3268,9 +3268,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3303,9 +3303,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3336,9 +3336,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3376,9 +3376,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -3411,9 +3411,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3448,9 +3448,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3483,9 +3483,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3525,9 +3525,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -3563,9 +3563,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3603,9 +3603,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3640,9 +3640,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3685,9 +3685,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -3755,9 +3755,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3828,9 +3828,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -3893,9 +3893,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3972,9 +3972,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4007,9 +4007,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4044,9 +4044,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -4080,9 +4080,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4124,9 +4124,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4163,9 +4163,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4204,9 +4204,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -4246,9 +4246,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4294,9 +4294,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4337,9 +4337,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4382,9 +4382,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -4429,9 +4429,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4481,9 +4481,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4525,9 +4525,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4571,9 +4571,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -4623,9 +4623,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4676,9 +4676,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4725,9 +4725,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4776,9 +4776,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -4833,9 +4833,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -4891,9 +4891,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -4945,9 +4945,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5001,9 +5001,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -5065,9 +5065,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5128,9 +5128,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i8_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -5249,9 +5249,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i8_ret:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5373,9 +5373,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i8_ret:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -5523,9 +5523,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5655,9 +5655,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -5682,9 +5682,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5711,9 +5711,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -5739,9 +5739,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5773,9 +5773,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -5800,9 +5800,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5829,9 +5829,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -5857,9 +5857,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5891,9 +5891,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -5918,9 +5918,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -5947,9 +5947,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -5975,9 +5975,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6009,9 +6009,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6037,9 +6037,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6067,9 +6067,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6095,9 +6095,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6129,9 +6129,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6157,9 +6157,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6187,9 +6187,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6216,9 +6216,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6250,9 +6250,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6277,9 +6277,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6306,9 +6306,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6334,9 +6334,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6368,9 +6368,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6396,9 +6396,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6426,9 +6426,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6455,9 +6455,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6489,9 +6489,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6516,9 +6516,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6545,9 +6545,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6573,9 +6573,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6607,9 +6607,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6634,9 +6634,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6663,9 +6663,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6691,9 +6691,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6725,9 +6725,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6753,9 +6753,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6783,9 +6783,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6811,9 +6811,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6845,9 +6845,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6874,9 +6874,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6905,9 +6905,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -6934,9 +6934,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -6969,9 +6969,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -6999,9 +6999,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7031,9 +7031,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7060,9 +7060,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7096,9 +7096,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7123,9 +7123,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7152,9 +7152,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7180,9 +7180,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7214,9 +7214,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7244,9 +7244,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7276,9 +7276,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7305,9 +7305,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7341,9 +7341,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7372,9 +7372,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7405,9 +7405,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7435,9 +7435,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7472,9 +7472,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7503,9 +7503,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7537,9 +7537,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7570,9 +7570,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7610,9 +7610,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7644,9 +7644,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7680,9 +7680,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7711,9 +7711,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7751,9 +7751,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7784,9 +7784,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7820,9 +7820,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -7855,9 +7855,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7897,9 +7897,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -7935,9 +7935,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -7975,9 +7975,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8014,9 +8014,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8060,9 +8060,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8101,9 +8101,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8144,9 +8144,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8185,9 +8185,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8235,9 +8235,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8270,9 +8270,9 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ;
 ; GFX10-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8308,9 +8308,9 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ;
 ; GFX11-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8344,9 +8344,9 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_i32_func_i32_imm:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8387,9 +8387,9 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8418,9 +8418,9 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8452,9 +8452,9 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8485,9 +8485,9 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8525,9 +8525,9 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8556,9 +8556,9 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8589,9 +8589,9 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8621,9 +8621,9 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8663,9 +8663,9 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8703,9 +8703,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ;
 ; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8746,9 +8746,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ;
 ; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8786,9 +8786,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8846,9 +8846,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -8895,9 +8895,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -8947,9 +8947,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v16i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -8994,9 +8994,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9052,9 +9052,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-LABEL: tail_call_byval_align16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s6, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -9143,9 +9143,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; GFX10-LABEL: tail_call_byval_align16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s6, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9237,9 +9237,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; GFX11-LABEL: tail_call_byval_align16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s4, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -9326,9 +9326,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: tail_call_byval_align16:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9424,9 +9424,9 @@ entry:
 define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9452,9 +9452,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9482,9 +9482,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9511,9 +9511,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9545,9 +9545,9 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9574,9 +9574,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9605,9 +9605,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9635,9 +9635,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9670,9 +9670,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9699,9 +9699,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9730,9 +9730,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9760,9 +9760,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9795,9 +9795,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9824,9 +9824,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9855,9 +9855,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -9885,9 +9885,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9920,9 +9920,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -9952,9 +9952,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -9986,9 +9986,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10019,9 +10019,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10057,9 +10057,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10093,9 +10093,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10131,9 +10131,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10168,9 +10168,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10211,9 +10211,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10249,9 +10249,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10289,9 +10289,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10328,9 +10328,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10372,9 +10372,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10414,9 +10414,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10458,9 +10458,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10501,9 +10501,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10552,9 +10552,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10600,9 +10600,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10650,9 +10650,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10699,9 +10699,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10755,9 +10755,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10784,9 +10784,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10815,9 +10815,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10845,9 +10845,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10880,9 +10880,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -10909,9 +10909,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -10940,9 +10940,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -10970,9 +10970,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11005,9 +11005,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11037,9 +11037,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11071,9 +11071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11104,9 +11104,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11142,9 +11142,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11177,9 +11177,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11214,9 +11214,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11250,9 +11250,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11291,9 +11291,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11332,9 +11332,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11375,9 +11375,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11417,9 +11417,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11464,9 +11464,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11496,9 +11496,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11530,9 +11530,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11563,9 +11563,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11601,9 +11601,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11639,9 +11639,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11679,9 +11679,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11718,9 +11718,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11762,9 +11762,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11806,9 +11806,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11852,9 +11852,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -11897,9 +11897,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -11947,9 +11947,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -11976,9 +11976,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12007,9 +12007,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12037,9 +12037,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12073,9 +12073,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12104,9 +12104,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12137,9 +12137,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12169,9 +12169,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12207,9 +12207,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12238,9 +12238,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12271,9 +12271,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12303,9 +12303,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12341,9 +12341,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12373,9 +12373,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12407,9 +12407,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12440,9 +12440,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12478,9 +12478,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12510,9 +12510,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12544,9 +12544,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12577,9 +12577,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12615,9 +12615,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12646,9 +12646,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12679,9 +12679,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12711,9 +12711,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12749,9 +12749,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12781,9 +12781,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12815,9 +12815,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12848,9 +12848,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12886,9 +12886,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -12915,9 +12915,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -12946,9 +12946,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -12976,9 +12976,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13012,9 +13012,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13043,9 +13043,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13076,9 +13076,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13108,9 +13108,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13146,9 +13146,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13178,9 +13178,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13212,9 +13212,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13245,9 +13245,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13283,9 +13283,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13318,9 +13318,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13355,9 +13355,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13391,9 +13391,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13432,9 +13432,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13470,9 +13470,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13510,9 +13510,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13549,9 +13549,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13593,9 +13593,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13628,9 +13628,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13665,9 +13665,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13701,9 +13701,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13743,9 +13743,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13781,9 +13781,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13821,9 +13821,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -13860,9 +13860,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13904,9 +13904,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -13945,9 +13945,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -13988,9 +13988,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14030,9 +14030,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14077,9 +14077,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14122,9 +14122,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14169,9 +14169,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14215,9 +14215,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14268,9 +14268,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14318,9 +14318,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14370,9 +14370,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14421,9 +14421,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14477,9 +14477,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14538,9 +14538,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14601,9 +14601,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -14663,9 +14663,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14732,9 +14732,9 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -14837,9 +14837,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -14945,9 +14945,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -15047,9 +15047,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15158,9 +15158,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -15268,9 +15268,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ;
 ; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15381,9 +15381,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ;
 ; GFX11-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -15487,9 +15487,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15605,9 +15605,9 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -15637,9 +15637,9 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ;
 ; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15672,9 +15672,9 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ;
 ; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -15701,9 +15701,9 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ;
 ; GFX10-SCRATCH-LABEL: stack_passed_arg_alignment_v32i32_f64:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15737,9 +15737,9 @@ entry:
 define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-LABEL: stack_12xv3i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -15803,9 +15803,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ;
 ; GFX10-LABEL: stack_12xv3i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15871,9 +15871,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ;
 ; GFX11-LABEL: stack_12xv3i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -15917,9 +15917,9 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: stack_12xv3i32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -15999,9 +15999,9 @@ entry:
 define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-LABEL: stack_8xv5i32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -16073,9 +16073,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ;
 ; GFX10-LABEL: stack_8xv5i32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -16149,9 +16149,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ;
 ; GFX11-LABEL: stack_8xv5i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -16199,9 +16199,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: stack_8xv5i32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
@@ -16283,9 +16283,9 @@ entry:
 define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-LABEL: stack_8xv5f32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -16357,9 +16357,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ;
 ; GFX10-LABEL: stack_8xv5f32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -16433,9 +16433,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ;
 ; GFX11-LABEL: stack_8xv5f32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -16489,9 +16489,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ;
 ; GFX10-SCRATCH-LABEL: stack_8xv5f32:
 ; GFX10-SCRATCH:       ; %bb.0: ; %entry
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index ad55d49a1a96d20..cabdd51fb3ef2b9 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -8,9 +8,9 @@ declare hidden amdgpu_gfx void @external_void_func_void() #0
 define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
 ; GFX9-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -41,9 +41,9 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ;
 ; GFX10-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -76,9 +76,9 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ;
 ; GFX11-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -116,7 +116,7 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s28_s29:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -137,12 +137,12 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s28_s29:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -165,12 +165,12 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_void_clobber_s28_s29:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -192,7 +192,7 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
   call void asm sideeffect "; clobber", "~{s[28:29]}"() #0
@@ -202,9 +202,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 {
 define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -238,9 +238,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ;
 ; GFX10-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -276,9 +276,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_void_func_void_mayclobber_s31:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -318,9 +318,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -354,9 +354,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ;
 ; GFX10-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -392,9 +392,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_void_func_void_mayclobber_v31:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -436,9 +436,9 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -472,9 +472,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -510,9 +510,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_void_func_void_preserves_s33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -553,9 +553,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -589,9 +589,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -627,9 +627,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_void_func_void_preserves_s34:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -670,9 +670,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 {
 ; GFX9-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -704,9 +704,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ;
 ; GFX10-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -740,9 +740,9 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ;
 ; GFX11-LABEL: test_call_void_func_void_preserves_v40:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -780,7 +780,7 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 define hidden void @void_func_void_clobber_s33() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -792,12 +792,12 @@ define hidden void @void_func_void_clobber_s33() #1 {
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -811,12 +811,12 @@ define hidden void @void_func_void_clobber_s33() #1 {
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_void_clobber_s33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -829,7 +829,7 @@ define hidden void @void_func_void_clobber_s33() #1 {
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s33}"() #0
   ret void
@@ -838,7 +838,7 @@ define hidden void @void_func_void_clobber_s33() #1 {
 define hidden void @void_func_void_clobber_s34() #1 {
 ; GFX9-LABEL: void_func_void_clobber_s34:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -850,12 +850,12 @@ define hidden void @void_func_void_clobber_s34() #1 {
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: void_func_void_clobber_s34:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -869,12 +869,12 @@ define hidden void @void_func_void_clobber_s34() #1 {
 ; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: void_func_void_clobber_s34:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -887,7 +887,7 @@ define hidden void @void_func_void_clobber_s34() #1 {
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "; clobber", "~{s34}"() #0
   ret void
@@ -896,9 +896,9 @@ define hidden void @void_func_void_clobber_s34() #1 {
 define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-LABEL: test_call_void_func_void_clobber_s33:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -922,9 +922,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ;
 ; GFX10-LABEL: test_call_void_func_void_clobber_s33:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -950,9 +950,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ;
 ; GFX11-LABEL: test_call_void_func_void_clobber_s33:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -981,9 +981,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-LABEL: test_call_void_func_void_clobber_s34:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1007,9 +1007,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ;
 ; GFX10-LABEL: test_call_void_func_void_clobber_s34:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1035,9 +1035,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ;
 ; GFX11-LABEL: test_call_void_func_void_clobber_s34:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1066,9 +1066,9 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-LABEL: callee_saved_sgpr_kernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1101,9 +1101,9 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ;
 ; GFX10-LABEL: callee_saved_sgpr_kernel:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1138,9 +1138,9 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ;
 ; GFX11-LABEL: callee_saved_sgpr_kernel:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -1180,9 +1180,9 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-LABEL: callee_saved_sgpr_vgpr_kernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s34, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
@@ -1224,9 +1224,9 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ;
 ; GFX10-LABEL: callee_saved_sgpr_vgpr_kernel:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s34, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -1270,9 +1270,9 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ;
 ; GFX11-LABEL: callee_saved_sgpr_vgpr_kernel:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 0d54da3128a617a..ac4f2924900e7ca 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -6,14 +6,14 @@
 define amdgpu_gfx i1 @return_i1() #0 {
 ; GFX9-LABEL: return_i1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: return_i1:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret i1 1
@@ -22,9 +22,9 @@ entry:
 define amdgpu_gfx void @call_i1() #0 {
 ; GFX9-LABEL: call_i1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -46,9 +46,9 @@ define amdgpu_gfx void @call_i1() #0 {
 ;
 ; GFX10-LABEL: call_i1:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -72,9 +72,9 @@ define amdgpu_gfx void @call_i1() #0 {
 ;
 ; GFX11-LABEL: call_i1:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -102,14 +102,14 @@ entry:
 define amdgpu_gfx i16 @return_i16() #0 {
 ; GFX9-LABEL: return_i16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: return_i16:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, 10
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret i16 10
@@ -118,9 +118,9 @@ entry:
 define amdgpu_gfx void @call_i16() #0 {
 ; GFX9-LABEL: call_i16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -142,9 +142,9 @@ define amdgpu_gfx void @call_i16() #0 {
 ;
 ; GFX10-LABEL: call_i16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -168,9 +168,9 @@ define amdgpu_gfx void @call_i16() #0 {
 ;
 ; GFX11-LABEL: call_i16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -198,14 +198,14 @@ entry:
 define amdgpu_gfx <2 x i16> @return_2xi16() #0 {
 ; GFX9-LABEL: return_2xi16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: return_2xi16:
 ; GFX10PLUS:       ; %bb.0: ; %entry
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <2 x i16> <i16 1, i16 2>
@@ -214,9 +214,9 @@ entry:
 define amdgpu_gfx void @call_2xi16() #0 {
 ; GFX9-LABEL: call_2xi16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -238,9 +238,9 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ;
 ; GFX10-LABEL: call_2xi16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -264,9 +264,9 @@ define amdgpu_gfx void @call_2xi16() #0 {
 ;
 ; GFX11-LABEL: call_2xi16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -294,22 +294,22 @@ entry:
 define amdgpu_gfx <3 x i16> @return_3xi16() #0 {
 ; GFX9-LABEL: return_3xi16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: return_3xi16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: return_3xi16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <3 x i16> <i16 1, i16 2, i16 3>
@@ -318,9 +318,9 @@ entry:
 define amdgpu_gfx void @call_3xi16() #0 {
 ; GFX9-LABEL: call_3xi16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -342,9 +342,9 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ;
 ; GFX10-LABEL: call_3xi16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -368,9 +368,9 @@ define amdgpu_gfx void @call_3xi16() #0 {
 ;
 ; GFX11-LABEL: call_3xi16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -400,7 +400,6 @@ entry:
 define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX9-LABEL: return_100xi32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -501,11 +500,11 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v97, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v98, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v99, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: return_100xi32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
@@ -606,11 +605,11 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v97, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v98, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v99, 0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: return_100xi32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0
@@ -661,6 +660,7 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v94, 0 :: v_dual_mov_b32 v95, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v96, 0 :: v_dual_mov_b32 v97, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v98, 0 :: v_dual_mov_b32 v99, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <100 x i32> zeroinitializer
 }
@@ -668,9 +668,9 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-LABEL: call_100xi32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -756,9 +756,9 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ;
 ; GFX10-LABEL: call_100xi32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -847,9 +847,9 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ;
 ; GFX11-LABEL: call_100xi32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -944,7 +944,6 @@ entry:
 define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX9-LABEL: return_512xi32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044
@@ -1458,12 +1457,11 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: return_512xi32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:1020
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044
@@ -1977,11 +1975,11 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: return_512xi32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@@ -2246,6 +2244,7 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_add_i32 s0, s0, 16
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <512 x i32> zeroinitializer
@@ -2254,10 +2253,10 @@ entry:
 define amdgpu_gfx void @call_512xi32() #0 {
 ; GFX9-LABEL: call_512xi32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x1ffc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xfffe0000
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2280,10 +2279,10 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ;
 ; GFX10-LABEL: call_512xi32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0xffe0
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffff0000
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_xor_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2308,11 +2307,11 @@ define amdgpu_gfx void @call_512xi32() #0 {
 ;
 ; GFX11-LABEL: call_512xi32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s34, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x7ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffff800
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v5, s33 offset:2048 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -2343,7 +2342,6 @@ entry:
 define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX9-LABEL: return_72xi32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:160
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:284
@@ -2498,12 +2496,11 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: return_72xi32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
@@ -2637,12 +2634,11 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:176
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:180
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: return_72xi32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x10
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:220
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:216
@@ -2778,7 +2774,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:212
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:216
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:220
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <72 x i32> %val
 }
@@ -2786,10 +2782,10 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX9-LABEL: call_72xi32:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_add_i32 s33, s32, 0x7fc0
 ; GFX9-NEXT:    s_and_b32 s33, s33, 0xffff8000
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3057,10 +3053,10 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ;
 ; GFX10-LABEL: call_72xi32:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s36, s33
 ; GFX10-NEXT:    s_add_i32 s33, s32, 0x3fe0
 ; GFX10-NEXT:    s_and_b32 s33, s33, 0xffffc000
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -3333,11 +3329,11 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ;
 ; GFX11-LABEL: call_72xi32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s45, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index c83fb79fde63d12..8be8d2472e26e0b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -10,7 +10,6 @@
 define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_xchg_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -24,18 +23,20 @@ define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_xchg_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -44,7 +45,6 @@ define void @global_atomic_xchg_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_xchg_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -58,21 +58,22 @@ define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; VI-LABEL: global_atomic_xchg_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
@@ -82,7 +83,6 @@ define void @global_atomic_xchg_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_xchg_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -91,24 +91,26 @@ define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -117,7 +119,6 @@ define i32 @global_atomic_xchg_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_xchg_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -126,27 +127,28 @@ define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
@@ -156,7 +158,7 @@ define i32 @global_atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -181,7 +183,6 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_xchg_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -189,17 +190,18 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -208,7 +210,7 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -233,7 +235,6 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -243,17 +244,18 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
@@ -263,7 +265,7 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -288,7 +290,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_xchg_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -296,17 +297,18 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -315,7 +317,7 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -340,7 +342,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -350,17 +351,18 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, i32 %in seq_cst
@@ -433,7 +435,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -447,18 +448,20 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
 ;
 ; VI-LABEL: global_atomic_xchg_f32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
   ret void
@@ -530,7 +533,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -544,21 +546,22 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i
 ;
 ; VI-LABEL: global_atomic_xchg_f32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
@@ -633,7 +636,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -642,24 +644,26 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_f32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
   ret float %result
@@ -735,7 +739,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -744,27 +747,28 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in
 ; SI-NEXT:    buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_f32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
@@ -848,7 +852,7 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -873,7 +877,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_xchg_f32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -881,17 +884,18 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
   ret void
@@ -978,7 +982,7 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1003,7 +1007,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -1013,17 +1016,18 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    flat_atomic_swap v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
@@ -1110,7 +1114,7 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1135,7 +1139,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_xchg_f32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -1143,17 +1146,18 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, float %in seq_cst
   ret float %result
@@ -1243,7 +1247,7 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1268,7 +1272,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -1278,17 +1281,18 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_swap v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr float, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, float %in seq_cst
@@ -1302,7 +1306,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(
 define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_add_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1316,18 +1319,20 @@ define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_add_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -1336,7 +1341,6 @@ define void @global_atomic_add_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_add_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1350,21 +1354,22 @@ define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; VI-LABEL: global_atomic_add_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1374,7 +1379,6 @@ define void @global_atomic_add_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_add_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1383,24 +1387,26 @@ define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_add_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1409,7 +1415,6 @@ define i32 @global_atomic_add_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_add_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1418,27 +1423,28 @@ define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_add_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1448,7 +1454,7 @@ define i32 @global_atomic_add_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_add_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1473,7 +1479,6 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_add_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -1481,17 +1486,18 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_add v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -1500,7 +1506,7 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_add_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1525,7 +1531,6 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_add_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -1535,17 +1540,18 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_add v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1555,7 +1561,7 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_add_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1580,7 +1586,6 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_add_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -1588,17 +1593,18 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1607,7 +1613,7 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_add_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1632,7 +1638,6 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_add_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -1642,17 +1647,18 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_add v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw add ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1666,7 +1672,6 @@ define amdgpu_gfx i32 @global_atomic_add_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_sub_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1680,18 +1685,20 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_sub_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -1700,7 +1707,6 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_sub_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1714,21 +1720,22 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; VI-LABEL: global_atomic_sub_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1738,7 +1745,6 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_sub_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1747,24 +1753,26 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1773,7 +1781,6 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_sub_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1782,27 +1789,28 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1812,7 +1820,7 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1837,7 +1845,6 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_sub_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -1845,17 +1852,18 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_sub v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -1864,7 +1872,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1889,7 +1897,6 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_sub_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -1899,17 +1906,18 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_sub v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -1919,7 +1927,7 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1944,7 +1952,6 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_sub_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -1952,17 +1959,18 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -1971,7 +1979,7 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1996,7 +2004,6 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_sub_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -2006,17 +2013,18 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_sub v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst
@@ -2030,7 +2038,6 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_and_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2044,18 +2051,20 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_and_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -2064,7 +2073,6 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_and_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2078,21 +2086,22 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; VI-LABEL: global_atomic_and_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -2102,7 +2111,6 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_and_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2111,24 +2119,26 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2137,7 +2147,6 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_and_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2146,27 +2155,28 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -2176,7 +2186,7 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_and_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2201,7 +2211,6 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_and_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -2209,17 +2218,18 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_and v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -2228,7 +2238,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_and_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2253,7 +2263,6 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_and_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -2263,17 +2272,18 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_and v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -2283,7 +2293,7 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_and_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2308,7 +2318,6 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_and_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -2316,17 +2325,18 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -2335,7 +2345,7 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_and_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2360,7 +2370,6 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_and_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -2370,17 +2379,18 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_and v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst
@@ -2394,7 +2404,6 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_nand_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2406,13 +2415,13 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, v4, v2
 ; SI-NEXT:    v_not_b32_e32 v3, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -2420,12 +2429,10 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB40_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -2440,6 +2447,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB40_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2448,7 +2456,6 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -2463,6 +2470,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2475,7 +2483,6 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_nand_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2487,13 +2494,13 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, v4, v2
 ; SI-NEXT:    v_not_b32_e32 v3, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -2501,12 +2508,10 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB41_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
@@ -2523,6 +2528,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB41_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2531,7 +2537,6 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB41_1: ; %atomicrmw.start
@@ -2546,6 +2551,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2559,7 +2565,6 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_nand_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2570,7 +2575,6 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, v5, v2
 ; SI-NEXT:    v_not_b32_e32 v4, v3
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
@@ -2579,6 +2583,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -2586,12 +2591,10 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -2606,6 +2609,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB42_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2615,7 +2619,6 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -2630,6 +2633,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2643,7 +2647,6 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_nand_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2654,7 +2657,6 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, v5, v2
 ; SI-NEXT:    v_not_b32_e32 v4, v3
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
@@ -2663,6 +2665,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -2670,12 +2673,10 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[3:4]
@@ -2692,6 +2693,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB43_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2700,7 +2702,6 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB43_1: ; %atomicrmw.start
@@ -2715,6 +2716,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2729,7 +2731,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2770,7 +2772,6 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_nand_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
@@ -2787,6 +2788,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB44_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2795,7 +2797,6 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -2811,6 +2812,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2823,7 +2825,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2864,7 +2866,6 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_nand_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -2883,6 +2884,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB45_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2891,7 +2893,6 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -2907,6 +2908,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2920,7 +2922,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2962,7 +2964,6 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_nand_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
@@ -2981,6 +2982,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB46_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2989,7 +2991,6 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -3005,6 +3006,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3017,7 +3019,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3059,7 +3061,6 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_nand_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s34
@@ -3078,6 +3079,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB47_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3086,7 +3088,6 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_nand_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -3102,6 +3103,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3119,7 +3121,6 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_or_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3133,18 +3134,20 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_or_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -3153,7 +3156,6 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_or_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3167,21 +3169,22 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_or_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3191,7 +3194,6 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_or_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3200,24 +3202,26 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -3226,7 +3230,6 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_or_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3235,27 +3238,28 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3265,7 +3269,7 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_or_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3290,7 +3294,6 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_or_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -3298,17 +3301,18 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    flat_atomic_or v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -3317,7 +3321,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_or_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3342,7 +3346,6 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_or_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -3352,17 +3355,18 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_or v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3372,7 +3376,7 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_or_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3397,7 +3401,6 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
 ;
 ; VI-LABEL: global_atomic_or_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -3405,17 +3408,18 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
 ; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -3424,7 +3428,7 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p
 define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_or_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3449,7 +3453,6 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_or_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -3459,17 +3462,18 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_or v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3483,7 +3487,6 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i
 define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_xor_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3497,18 +3500,20 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_xor_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -3517,7 +3522,6 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_xor_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3531,21 +3535,22 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; VI-LABEL: global_atomic_xor_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3555,7 +3560,6 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_xor_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3564,24 +3568,26 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -3590,7 +3596,6 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_xor_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3599,27 +3604,28 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3629,7 +3635,7 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3654,7 +3660,6 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_xor_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -3662,17 +3667,18 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_xor v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -3681,7 +3687,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3706,7 +3712,6 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_xor_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -3716,17 +3721,18 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_xor v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3736,7 +3742,7 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3761,7 +3767,6 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -3769,17 +3774,18 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -3788,7 +3794,7 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3813,7 +3819,6 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_xor_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -3823,17 +3828,18 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_xor v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst
@@ -3847,7 +3853,6 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_max_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3858,13 +3863,13 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_max_i32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -3872,12 +3877,10 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB64_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -3891,6 +3894,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB64_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3899,7 +3903,6 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -3913,6 +3916,7 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB64_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3925,7 +3929,6 @@ define void @global_atomic_max_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_max_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3936,13 +3939,13 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_max_i32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -3950,12 +3953,10 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB65_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
@@ -3971,6 +3972,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB65_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3979,7 +3981,6 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; GFX9-LABEL: global_atomic_max_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB65_1: ; %atomicrmw.start
@@ -3993,6 +3994,7 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB65_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4006,7 +4008,6 @@ define void @global_atomic_max_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_max_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -4017,7 +4018,6 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_max_i32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -4025,6 +4025,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -4032,12 +4033,10 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -4051,6 +4050,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB66_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4060,7 +4060,6 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -4074,6 +4073,7 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB66_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4087,7 +4087,6 @@ define i32 @global_atomic_max_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_max_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -4098,7 +4097,6 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_max_i32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -4106,6 +4104,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -4113,12 +4112,10 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[3:4]
@@ -4134,6 +4131,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB67_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4142,7 +4140,6 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB67_1: ; %atomicrmw.start
@@ -4156,6 +4153,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB67_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4170,7 +4168,7 @@ define i32 @global_atomic_max_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_max_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4210,7 +4208,6 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_max_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
@@ -4226,6 +4223,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB68_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4234,7 +4232,6 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; GFX9-LABEL: global_atomic_max_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4249,6 +4246,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB68_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4261,7 +4259,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_max_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4301,7 +4299,6 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_max_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -4319,6 +4316,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB69_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4327,7 +4325,6 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; GFX9-LABEL: global_atomic_max_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4342,6 +4339,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB69_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4355,7 +4353,7 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_max_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4396,7 +4394,6 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_max_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
@@ -4414,6 +4411,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB70_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4422,7 +4420,6 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; GFX9-LABEL: global_atomic_max_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4437,6 +4434,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB70_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4449,7 +4447,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_max_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4490,7 +4488,6 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_max_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s34
@@ -4508,6 +4505,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB71_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4516,7 +4514,6 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_max_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4531,6 +4528,7 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB71_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4980,7 +4978,6 @@ entry:
 define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_umax_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -4991,13 +4988,13 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_max_u32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -5005,12 +5002,10 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB76_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -5024,6 +5019,7 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB76_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5032,7 +5028,6 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -5046,6 +5041,7 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB76_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5058,7 +5054,6 @@ define void @global_atomic_umax_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_umax_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -5069,13 +5064,13 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_max_u32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -5083,12 +5078,10 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB77_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
@@ -5104,6 +5097,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB77_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5112,7 +5106,6 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB77_1: ; %atomicrmw.start
@@ -5126,6 +5119,7 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB77_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5139,7 +5133,6 @@ define void @global_atomic_umax_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_umax_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -5150,7 +5143,6 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_max_u32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -5158,6 +5150,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -5165,12 +5158,10 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -5184,6 +5175,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB78_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5193,7 +5185,6 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -5207,6 +5198,7 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB78_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5220,7 +5212,6 @@ define i32 @global_atomic_umax_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_umax_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -5231,7 +5222,6 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_max_u32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -5239,6 +5229,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -5246,12 +5237,10 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[3:4]
@@ -5267,6 +5256,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB79_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5275,7 +5265,6 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB79_1: ; %atomicrmw.start
@@ -5289,6 +5278,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB79_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5303,7 +5293,7 @@ define i32 @global_atomic_umax_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5343,7 +5333,6 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_umax_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
@@ -5359,6 +5348,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB80_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5367,7 +5357,6 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5382,6 +5371,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB80_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5394,7 +5384,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5434,7 +5424,6 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_umax_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -5452,6 +5441,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB81_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5460,7 +5450,6 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5475,6 +5464,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB81_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5488,7 +5478,7 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5529,7 +5519,6 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_umax_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
@@ -5547,6 +5536,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB82_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5555,7 +5545,6 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5570,6 +5559,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB82_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5582,7 +5572,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5623,7 +5613,6 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_umax_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s34
@@ -5641,6 +5630,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB83_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5649,7 +5639,6 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_umax_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5664,6 +5653,7 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB83_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6015,7 +6005,6 @@ entry:
 define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_umin_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6026,13 +6015,13 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_min_u32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6040,12 +6029,10 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB87_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -6059,6 +6046,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB87_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6067,7 +6055,6 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -6081,6 +6068,7 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB87_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6093,7 +6081,6 @@ define void @global_atomic_umin_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_umin_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6104,13 +6091,13 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_min_u32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6118,12 +6105,10 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB88_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
@@ -6139,6 +6124,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB88_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6147,7 +6133,6 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB88_1: ; %atomicrmw.start
@@ -6161,6 +6146,7 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB88_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6174,7 +6160,6 @@ define void @global_atomic_umin_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_umin_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6185,7 +6170,6 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_min_u32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6193,6 +6177,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -6200,12 +6185,10 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -6219,6 +6202,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB89_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6228,7 +6212,6 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -6242,6 +6225,7 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6255,7 +6239,6 @@ define i32 @global_atomic_umin_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_umin_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6266,7 +6249,6 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_min_u32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6274,6 +6256,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -6281,12 +6264,10 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[3:4]
@@ -6302,6 +6283,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB90_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6310,7 +6292,6 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB90_1: ; %atomicrmw.start
@@ -6324,6 +6305,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB90_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6338,7 +6320,7 @@ define i32 @global_atomic_umin_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6378,7 +6360,6 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_umin_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
@@ -6394,6 +6375,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB91_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6402,7 +6384,6 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6417,6 +6398,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6429,7 +6411,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6469,7 +6451,6 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_umin_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -6487,6 +6468,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB92_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6495,7 +6477,6 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6510,6 +6491,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB92_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6523,7 +6505,7 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6564,7 +6546,6 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_umin_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
@@ -6582,6 +6563,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB93_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6590,7 +6572,6 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6605,6 +6586,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6617,7 +6599,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6658,7 +6640,6 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_umin_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s34
@@ -6676,6 +6657,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB94_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6684,7 +6666,6 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_umin_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6699,6 +6680,7 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB94_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6716,7 +6698,6 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_min_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6727,13 +6708,13 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_min_i32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6741,12 +6722,10 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB95_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -6760,6 +6739,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB95_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6768,7 +6748,6 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -6782,6 +6761,7 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB95_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6794,7 +6774,6 @@ define void @global_atomic_min_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_min_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6805,13 +6784,13 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_min_i32_e32 v3, v4, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v6, v4
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v4
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6819,12 +6798,10 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB96_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
@@ -6840,6 +6817,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB96_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6848,7 +6826,6 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ;
 ; GFX9-LABEL: global_atomic_min_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB96_1: ; %atomicrmw.start
@@ -6862,6 +6839,7 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB96_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6875,7 +6853,6 @@ define void @global_atomic_min_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
 define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_min_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6886,7 +6863,6 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_min_i32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6894,6 +6870,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -6901,12 +6878,10 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -6920,6 +6895,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB97_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6929,7 +6905,6 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -6943,6 +6918,7 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB97_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6956,7 +6932,6 @@ define i32 @global_atomic_min_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_min_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6967,7 +6942,6 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_min_i32_e32 v4, v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v3, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, v5
@@ -6975,6 +6949,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
@@ -6982,12 +6957,10 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[3:4]
@@ -7003,6 +6976,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB98_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7011,7 +6985,6 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off offset:16
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB98_1: ; %atomicrmw.start
@@ -7025,6 +6998,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB98_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7039,7 +7013,7 @@ define i32 @global_atomic_min_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_min_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7079,7 +7053,6 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_min_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
@@ -7095,6 +7068,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB99_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7103,7 +7077,6 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ;
 ; GFX9-LABEL: global_atomic_min_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7118,6 +7091,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB99_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7130,7 +7104,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_min_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7170,7 +7144,6 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_min_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -7188,6 +7161,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v3, v2
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB100_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7196,7 +7170,6 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ;
 ; GFX9-LABEL: global_atomic_min_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dword v1, v2, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7211,6 +7184,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB100_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7224,7 +7198,7 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_min_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7265,7 +7239,6 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_min_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
@@ -7283,6 +7256,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB101_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7291,7 +7265,6 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; GFX9-LABEL: global_atomic_min_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7306,6 +7279,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB101_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7318,7 +7292,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_min_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7359,7 +7333,6 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_min_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s34
@@ -7377,6 +7350,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v4
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB102_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7385,7 +7359,6 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_min_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_dword v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7400,6 +7373,7 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB102_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7836,7 +7810,6 @@ entry:
 define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7850,18 +7823,20 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -7870,7 +7845,6 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7884,21 +7858,22 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -7908,7 +7883,6 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7917,24 +7891,26 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -7943,7 +7919,6 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7952,27 +7927,28 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; SI-NEXT:    buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -7982,7 +7958,7 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8007,7 +7983,6 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -8015,17 +7990,18 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_inc v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -8034,7 +8010,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8059,7 +8035,6 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -8069,17 +8044,18 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ; VI-NEXT:    flat_atomic_inc v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8089,7 +8065,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8114,7 +8090,6 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -8122,17 +8097,18 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -8141,7 +8117,7 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i
 define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8166,7 +8142,6 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -8176,17 +8151,18 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
 ; VI-NEXT:    flat_atomic_inc v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8200,7 +8176,6 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa
 define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8214,18 +8189,20 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -8234,7 +8211,6 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
 define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8248,21 +8224,22 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v[0:1], v2, off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8272,7 +8249,6 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32
 define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8281,24 +8257,26 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 ; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -8307,7 +8285,6 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
 define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8316,27 +8293,28 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 ; SI-NEXT:    buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
-; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v[0:1], v2, off offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8346,7 +8324,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i
 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8371,7 +8349,6 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -8379,17 +8356,18 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_dec v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v1, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret void
@@ -8398,7 +8376,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8423,7 +8401,6 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -8433,17 +8410,18 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; VI-NEXT:    flat_atomic_dec v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v1, s[4:5] offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
@@ -8453,7 +8431,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) inreg %ptr, i32 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8478,7 +8456,6 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
@@ -8486,17 +8463,18 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v0, v1, s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst
   ret i32 %result
@@ -8505,7 +8483,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i
 define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspace(1) inreg %out, i32 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8530,7 +8508,6 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 16
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s34
@@ -8540,17 +8517,18 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa
 ; VI-NEXT:    flat_atomic_dec v0, v[0:1], v2 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i32, ptr addrspace(1) %out, i32 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 5a827584aa1062a..9c201eb857a9750 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -10,7 +10,6 @@
 define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_xchg_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -24,18 +23,20 @@ define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_xchg_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -44,7 +45,6 @@ define void @global_atomic_xchg_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_xchg_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -58,21 +58,22 @@ define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; VI-LABEL: global_atomic_xchg_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
@@ -82,7 +83,6 @@ define void @global_atomic_xchg_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_xchg_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -91,25 +91,27 @@ define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -118,7 +120,6 @@ define i64 @global_atomic_xchg_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_xchg_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -127,28 +128,29 @@ define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
@@ -158,7 +160,7 @@ define i64 @global_atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -185,7 +187,6 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_xchg_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -194,11 +195,11 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -206,6 +207,7 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -214,7 +216,7 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -239,7 +241,6 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -250,11 +251,11 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -262,6 +263,7 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
@@ -271,7 +273,7 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -298,7 +300,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_xchg_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -307,11 +308,11 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -319,6 +320,7 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -327,7 +329,7 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -352,7 +354,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -363,11 +364,11 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -375,6 +376,7 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, i64 %in seq_cst
@@ -447,7 +449,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -461,18 +462,20 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
 ;
 ; VI-LABEL: global_atomic_xchg_f64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
   ret void
@@ -544,7 +547,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -558,21 +560,22 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %
 ;
 ; VI-LABEL: global_atomic_xchg_f64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
@@ -647,7 +650,6 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -656,25 +658,27 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
 ; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_f64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
   ret double %result
@@ -750,7 +754,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -759,28 +762,29 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %
 ; SI-NEXT:    buffer_atomic_swap_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xchg_f64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
@@ -864,7 +868,7 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -891,7 +895,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_xchg_f64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -900,11 +903,11 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -912,6 +915,7 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
   ret void
@@ -998,7 +1002,7 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1023,7 +1027,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -1034,11 +1037,11 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    flat_atomic_swap_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1046,6 +1049,7 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(1) %out, i32 4
   %tmp0 = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
@@ -1132,7 +1136,7 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1159,7 +1163,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_xchg_f64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1168,11 +1171,11 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1180,6 +1183,7 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(1) %ptr, double %in seq_cst
   ret double %result
@@ -1269,7 +1273,7 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
 ; GCN3-NEXT:    s_setpc_b64 s[30:31]
 ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1294,7 +1298,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -1305,11 +1308,11 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
 ; VI-NEXT:    flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1317,6 +1320,7 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr double, ptr addrspace(1) %out, i32 4
   %result = atomicrmw xchg ptr addrspace(1) %gep, double %in seq_cst
@@ -1330,7 +1334,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace
 define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_add_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1344,18 +1347,20 @@ define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_add_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1364,7 +1369,6 @@ define void @global_atomic_add_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_add_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1378,21 +1382,22 @@ define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; VI-LABEL: global_atomic_add_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1402,7 +1407,6 @@ define void @global_atomic_add_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_add_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1411,25 +1415,27 @@ define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_add_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1438,7 +1444,6 @@ define i64 @global_atomic_add_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_add_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1447,28 +1452,29 @@ define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_add_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_add_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1478,7 +1484,7 @@ define i64 @global_atomic_add_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_add_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1505,7 +1511,6 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_add_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1514,11 +1519,11 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1526,6 +1531,7 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1534,7 +1540,7 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_add_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1559,7 +1565,6 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_add_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -1570,11 +1575,11 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_add_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1582,6 +1587,7 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1591,7 +1597,7 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_add_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1618,7 +1624,6 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_add_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1627,11 +1632,11 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1639,6 +1644,7 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1647,7 +1653,7 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_add_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1672,7 +1678,6 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_add_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -1683,11 +1688,11 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_add_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1695,6 +1700,7 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw add ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1708,7 +1714,6 @@ define amdgpu_gfx i64 @global_atomic_add_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_sub_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1722,18 +1727,20 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_sub_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1742,7 +1749,6 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_sub_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1756,21 +1762,22 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; VI-LABEL: global_atomic_sub_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1780,7 +1787,6 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_sub_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1789,25 +1795,27 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -1816,7 +1824,6 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_sub_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -1825,28 +1832,29 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1856,7 +1864,7 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1883,7 +1891,6 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_sub_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -1892,11 +1899,11 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1904,6 +1911,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -1912,7 +1920,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1937,7 +1945,6 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_sub_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -1948,11 +1955,11 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_sub_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -1960,6 +1967,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -1969,7 +1977,7 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -1996,7 +2004,6 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2005,11 +2012,11 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2017,6 +2024,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2025,7 +2033,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2050,7 +2058,6 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_sub_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -2061,11 +2068,11 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2073,6 +2080,7 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2086,7 +2094,6 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2100,18 +2107,20 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_and_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -2120,7 +2129,6 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2134,21 +2142,22 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; VI-LABEL: global_atomic_and_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2158,7 +2167,6 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2167,25 +2175,27 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2194,7 +2204,6 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_and_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2203,28 +2212,29 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_and_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2234,7 +2244,7 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_and_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2261,7 +2271,6 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_and_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2270,11 +2279,11 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2282,6 +2291,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -2290,7 +2300,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_and_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2315,7 +2325,6 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_and_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -2326,11 +2335,11 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_and_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2338,6 +2347,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2347,7 +2357,7 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_and_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2374,7 +2384,6 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_and_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -2383,11 +2392,11 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2395,6 +2404,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -2403,7 +2413,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_and_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2428,7 +2438,6 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_and_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -2439,11 +2448,11 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -2451,6 +2460,7 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst
@@ -2464,7 +2474,6 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_nand_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2475,7 +2484,6 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, v7, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v8, v6, v2
 ; SI-NEXT:    v_not_b32_e32 v5, v4
 ; SI-NEXT:    v_not_b32_e32 v4, v8
@@ -2487,6 +2495,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -2495,12 +2504,10 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB40_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -2518,6 +2525,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB40_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2526,7 +2534,6 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB40_1: ; %atomicrmw.start
@@ -2544,6 +2551,7 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB40_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2556,7 +2564,6 @@ define void @global_atomic_nand_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_nand_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -2567,7 +2574,6 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v4, v7, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v8, v6, v2
 ; SI-NEXT:    v_not_b32_e32 v5, v4
 ; SI-NEXT:    v_not_b32_e32 v4, v8
@@ -2579,6 +2585,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -2587,12 +2594,10 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB41_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
@@ -2612,6 +2617,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB41_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2620,7 +2626,6 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB41_1: ; %atomicrmw.start
@@ -2638,6 +2643,7 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB41_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2651,7 +2657,6 @@ define void @global_atomic_nand_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_nand_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, v3
 ; SI-NEXT:    v_mov_b32_e32 v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -2667,7 +2672,6 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v1
 ; SI-NEXT:    v_mov_b32_e32 v10, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, v11, v4
 ; SI-NEXT:    v_and_b32_e32 v1, v10, v5
 ; SI-NEXT:    v_not_b32_e32 v9, v0
@@ -2680,18 +2684,17 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB42_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -2709,6 +2712,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB42_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2719,7 +2723,6 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB42_1: ; %atomicrmw.start
@@ -2737,6 +2740,7 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB42_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2751,7 +2755,6 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_nand_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v4, v3
 ; SI-NEXT:    v_mov_b32_e32 v5, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -2767,7 +2770,6 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v1
 ; SI-NEXT:    v_mov_b32_e32 v10, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, v11, v4
 ; SI-NEXT:    v_and_b32_e32 v1, v10, v5
 ; SI-NEXT:    v_not_b32_e32 v9, v0
@@ -2780,18 +2782,17 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB43_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_nand_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
@@ -2811,6 +2812,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB43_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2819,7 +2821,6 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB43_1: ; %atomicrmw.start
@@ -2837,6 +2838,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB43_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2852,7 +2854,7 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -2899,7 +2901,6 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_nand_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -2921,6 +2922,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB44_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2929,7 +2931,6 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -2948,6 +2949,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB44_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -2960,7 +2962,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3007,7 +3009,6 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_nand_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
@@ -3029,6 +3030,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB45_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3037,7 +3039,6 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -3056,6 +3057,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB45_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3069,7 +3071,7 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3118,7 +3120,6 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_nand_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -3140,6 +3141,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB46_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3148,7 +3150,6 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -3167,6 +3168,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB46_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3179,7 +3181,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3228,7 +3230,6 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_nand_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -3250,6 +3251,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB47_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3258,7 +3260,6 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_nand_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -3277,6 +3278,7 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB47_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -3294,7 +3296,6 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3308,18 +3309,20 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_or_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -3328,7 +3331,6 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3342,21 +3344,22 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_or_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3366,7 +3369,6 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3375,25 +3377,27 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -3402,7 +3406,6 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_or_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3411,28 +3414,29 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_or_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3442,7 +3446,7 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_or_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3469,7 +3473,6 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_or_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3478,11 +3481,11 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3490,6 +3493,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -3498,7 +3502,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_or_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3523,7 +3527,6 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_or_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -3534,11 +3537,11 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_or_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3546,6 +3549,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
 ; GFX9-NEXT:    global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3555,7 +3559,7 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1
 define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_or_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3582,7 +3586,6 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
 ;
 ; VI-LABEL: global_atomic_or_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3591,11 +3594,11 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3603,6 +3606,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -3611,7 +3615,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
 define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_or_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3636,7 +3640,6 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_or_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -3647,11 +3650,11 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3659,6 +3662,7 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
 ; GFX9-NEXT:    global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3672,7 +3676,6 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
 define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_xor_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3686,18 +3689,20 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_xor_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -3706,7 +3711,6 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_xor_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3720,21 +3724,22 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; VI-LABEL: global_atomic_xor_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3744,7 +3749,6 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_xor_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3753,25 +3757,27 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -3780,7 +3786,6 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_xor_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -3789,28 +3794,29 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_xor_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3820,7 +3826,7 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3847,7 +3853,6 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_xor_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3856,11 +3861,11 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3868,6 +3873,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -3876,7 +3882,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3901,7 +3907,6 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_xor_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -3912,11 +3917,11 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    flat_atomic_xor_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3924,6 +3929,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -3933,7 +3939,7 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -3960,7 +3966,6 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_xor_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -3969,11 +3974,11 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -3981,6 +3986,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -3989,7 +3995,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4014,7 +4020,6 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_xor_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -4025,11 +4030,11 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -4037,6 +4042,7 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst
@@ -4050,7 +4056,6 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_max_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -4063,7 +4068,6 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -4072,6 +4076,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -4080,12 +4085,10 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB64_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -4102,6 +4105,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB64_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4110,7 +4114,6 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB64_1: ; %atomicrmw.start
@@ -4127,6 +4130,7 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB64_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4139,7 +4143,6 @@ define void @global_atomic_max_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_max_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -4152,7 +4155,6 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -4161,6 +4163,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -4169,12 +4172,10 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB65_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
@@ -4193,6 +4194,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB65_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4201,7 +4203,6 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: global_atomic_max_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB65_1: ; %atomicrmw.start
@@ -4218,6 +4219,7 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB65_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4231,7 +4233,6 @@ define void @global_atomic_max_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_max_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -4250,7 +4251,6 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -4259,18 +4259,17 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB66_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -4287,6 +4286,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB66_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4297,7 +4297,6 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB66_1: ; %atomicrmw.start
@@ -4314,6 +4313,7 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB66_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4328,7 +4328,6 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_max_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -4347,7 +4346,6 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -4356,18 +4354,17 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB67_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_max_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
@@ -4386,6 +4383,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB67_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4394,7 +4392,6 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_max_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB67_1: ; %atomicrmw.start
@@ -4411,6 +4408,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB67_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4426,7 +4424,7 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_max_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4474,7 +4472,6 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_max_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -4497,6 +4494,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB68_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4505,7 +4503,6 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; GFX9-LABEL: global_atomic_max_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4525,6 +4522,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB68_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4537,7 +4535,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_max_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4585,7 +4583,6 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_max_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
@@ -4608,6 +4605,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB69_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4616,7 +4614,6 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; GFX9-LABEL: global_atomic_max_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4636,6 +4633,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB69_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4649,7 +4647,7 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_max_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4699,7 +4697,6 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_max_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -4722,6 +4719,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB70_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4730,7 +4728,6 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; GFX9-LABEL: global_atomic_max_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4750,6 +4747,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB70_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4762,7 +4760,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_max_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -4812,7 +4810,6 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_max_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -4835,6 +4832,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB71_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -4843,7 +4841,6 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_max_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -4863,6 +4860,7 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB71_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5368,7 +5366,6 @@ entry:
 define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_umax_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -5381,7 +5378,6 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -5390,6 +5386,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -5398,12 +5395,10 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB76_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -5420,6 +5415,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB76_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5428,7 +5424,6 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB76_1: ; %atomicrmw.start
@@ -5445,6 +5440,7 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB76_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5457,7 +5453,6 @@ define void @global_atomic_umax_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_umax_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -5470,7 +5465,6 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -5479,6 +5473,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -5487,12 +5482,10 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB77_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
@@ -5511,6 +5504,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB77_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5519,7 +5513,6 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB77_1: ; %atomicrmw.start
@@ -5536,6 +5529,7 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB77_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5549,7 +5543,6 @@ define void @global_atomic_umax_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_umax_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -5568,7 +5561,6 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -5577,18 +5569,17 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB78_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -5605,6 +5596,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB78_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5615,7 +5607,6 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB78_1: ; %atomicrmw.start
@@ -5632,6 +5623,7 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB78_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5646,7 +5638,6 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_umax_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -5665,7 +5656,6 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -5674,18 +5664,17 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB79_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umax_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
@@ -5704,6 +5693,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB79_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5712,7 +5702,6 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB79_1: ; %atomicrmw.start
@@ -5729,6 +5718,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB79_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5744,7 +5734,7 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5792,7 +5782,6 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_umax_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -5815,6 +5804,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB80_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5823,7 +5813,6 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5843,6 +5832,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB80_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5855,7 +5845,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -5903,7 +5893,6 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_umax_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
@@ -5926,6 +5915,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB81_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5934,7 +5924,6 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -5954,6 +5943,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB81_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -5967,7 +5957,7 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6017,7 +6007,6 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_umax_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -6040,6 +6029,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB82_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6048,7 +6038,6 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6068,6 +6057,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB82_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6080,7 +6070,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6130,7 +6120,6 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_umax_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -6153,6 +6142,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB83_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6161,7 +6151,6 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_umax_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -6181,6 +6170,7 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB83_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6571,7 +6561,6 @@ entry:
 define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_umin_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6584,7 +6573,6 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -6593,6 +6581,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -6601,12 +6590,10 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB87_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -6623,6 +6610,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB87_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6631,7 +6619,6 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB87_1: ; %atomicrmw.start
@@ -6648,6 +6635,7 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB87_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6660,7 +6648,6 @@ define void @global_atomic_umin_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_umin_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -6673,7 +6660,6 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -6682,6 +6668,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -6690,12 +6677,10 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB88_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
@@ -6714,6 +6699,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB88_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6722,7 +6708,6 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB88_1: ; %atomicrmw.start
@@ -6739,6 +6724,7 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB88_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6752,7 +6738,6 @@ define void @global_atomic_umin_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_umin_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -6771,7 +6756,6 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -6780,18 +6764,17 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB89_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -6808,6 +6791,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB89_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6818,7 +6802,6 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB89_1: ; %atomicrmw.start
@@ -6835,6 +6818,7 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB89_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6849,7 +6833,6 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_umin_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -6868,7 +6851,6 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -6877,18 +6859,17 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB90_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_umin_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
@@ -6907,6 +6888,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB90_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6915,7 +6897,6 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB90_1: ; %atomicrmw.start
@@ -6932,6 +6913,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB90_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -6947,7 +6929,7 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -6995,7 +6977,6 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; VI-LABEL: global_atomic_umin_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -7018,6 +6999,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB91_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7026,7 +7008,6 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7046,6 +7027,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB91_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7058,7 +7040,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr
 define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7106,7 +7088,6 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ;
 ; VI-LABEL: global_atomic_umin_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
@@ -7129,6 +7110,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB92_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7137,7 +7119,6 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7157,6 +7138,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB92_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7170,7 +7152,7 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace
 define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7220,7 +7202,6 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; VI-LABEL: global_atomic_umin_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -7243,6 +7224,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB93_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7251,7 +7233,6 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7271,6 +7252,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB93_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7283,7 +7265,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
 define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7333,7 +7315,6 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_umin_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -7356,6 +7337,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB94_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7364,7 +7346,6 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_umin_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7384,6 +7365,7 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB94_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7401,7 +7383,6 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
 define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_min_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7414,7 +7395,6 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -7423,6 +7403,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -7431,12 +7412,10 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    s_cbranch_execnz .LBB95_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -7453,6 +7432,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB95_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7461,7 +7441,6 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB95_1: ; %atomicrmw.start
@@ -7478,6 +7457,7 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB95_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7490,7 +7470,6 @@ define void @global_atomic_min_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_min_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -7503,7 +7482,6 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v2, v6, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v11, v7
 ; SI-NEXT:    v_mov_b32_e32 v10, v6
 ; SI-NEXT:    v_mov_b32_e32 v9, v5
@@ -7512,6 +7490,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v6, v8
@@ -7520,12 +7499,10 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; SI-NEXT:    s_cbranch_execnz .LBB96_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
@@ -7544,6 +7521,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; VI-NEXT:    v_mov_b32_e32 v7, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v6, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB96_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7552,7 +7530,6 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ;
 ; GFX9-LABEL: global_atomic_min_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB96_1: ; %atomicrmw.start
@@ -7569,6 +7546,7 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB96_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7582,7 +7560,6 @@ define void @global_atomic_min_i64_noret_offset(ptr addrspace(1) %out, i64 %in)
 define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_min_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -7601,7 +7578,6 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -7610,18 +7586,17 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB97_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
 ; VI-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -7638,6 +7613,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB97_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7648,7 +7624,6 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB97_1: ; %atomicrmw.start
@@ -7665,6 +7640,7 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB97_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7679,7 +7655,6 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_min_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v5, v3
 ; SI-NEXT:    v_mov_b32_e32 v4, v2
 ; SI-NEXT:    v_mov_b32_e32 v7, v1
@@ -7698,7 +7673,6 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    v_cmp_le_i64_e32 vcc, v[10:11], v[4:5]
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v5, v11, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v4, v10, vcc
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v8
 ; SI-NEXT:    v_mov_b32_e32 v1, v9
 ; SI-NEXT:    v_mov_b32_e32 v2, v10
@@ -7707,18 +7681,17 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-NEXT:    buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
 ; SI-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; SI-NEXT:    s_cbranch_execnz .LBB98_1
 ; SI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_min_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[4:5]
@@ -7737,6 +7710,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB98_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7745,7 +7719,6 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ;
 ; GFX9-LABEL: global_atomic_min_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off offset:32
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB98_1: ; %atomicrmw.start
@@ -7762,6 +7735,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB98_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7777,7 +7751,7 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_min_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7825,7 +7799,6 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; VI-LABEL: global_atomic_min_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
@@ -7848,6 +7821,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB99_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7856,7 +7830,6 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ;
 ; GFX9-LABEL: global_atomic_min_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7876,6 +7849,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB99_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7888,7 +7862,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre
 define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_min_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -7936,7 +7910,6 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; VI-LABEL: global_atomic_min_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
@@ -7959,6 +7932,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB100_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -7967,7 +7941,6 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ;
 ; GFX9-LABEL: global_atomic_min_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -7987,6 +7960,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB100_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8000,7 +7974,7 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace(
 define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_min_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8050,7 +8024,6 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; VI-LABEL: global_atomic_min_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
@@ -8073,6 +8046,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB101_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8081,7 +8055,6 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 ;
 ; GFX9-LABEL: global_atomic_min_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -8101,6 +8074,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB101_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8113,7 +8087,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg %
 define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_min_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8163,7 +8137,6 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; VI-LABEL: global_atomic_min_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -8186,6 +8159,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
 ; VI-NEXT:    buffer_wbinvl1_vol
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
 ; VI-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; VI-NEXT:    s_cbranch_execnz .LBB102_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8194,7 +8168,6 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
 ;
 ; GFX9-LABEL: global_atomic_min_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:32
 ; GFX9-NEXT:    s_mov_b64 s[34:35], 0
@@ -8214,6 +8187,7 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
 ; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB102_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
@@ -8708,7 +8682,6 @@ entry:
 define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8722,18 +8695,20 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -8742,7 +8717,6 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8756,21 +8730,22 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -8780,7 +8755,6 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
 define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8789,25 +8763,27 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -8816,7 +8792,6 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -8825,28 +8800,29 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
 ; SI-NEXT:    buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -8856,7 +8832,7 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
 define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8883,7 +8859,6 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -8892,11 +8867,11 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -8904,6 +8879,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
 ; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -8912,7 +8888,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1
 define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8937,7 +8913,6 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -8948,11 +8923,11 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -8960,6 +8935,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
 ; GFX9-NEXT:    global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -8969,7 +8945,7 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr
 define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -8996,7 +8972,6 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -9005,11 +8980,11 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9017,6 +8992,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -9025,7 +9001,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i
 define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -9050,7 +9026,6 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
 ;
 ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -9061,11 +9036,11 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9073,6 +9048,7 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
 ; GFX9-NEXT:    global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9086,7 +9062,6 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa
 define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_noret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -9100,18 +9075,20 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_noret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -9120,7 +9097,6 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) {
 define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -9134,21 +9110,22 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[2:3], off offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9158,7 +9135,6 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64
 define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_ret:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -9167,25 +9143,27 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 ; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_ret:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -9194,7 +9172,6 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
 define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -9203,28 +9180,29 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
 ; SI-NEXT:    buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_wbinvl1
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9234,7 +9212,7 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i
 define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -9261,7 +9239,6 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -9270,11 +9247,11 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9282,6 +9259,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
 ; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret void
@@ -9290,7 +9268,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1
 define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -9315,7 +9293,6 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -9326,11 +9303,11 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9338,6 +9315,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
 ; GFX9-NEXT:    global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
@@ -9347,7 +9325,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr
 define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) inreg %ptr, i64 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -9374,7 +9352,6 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
@@ -9383,11 +9360,11 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9395,6 +9372,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst
   ret i64 %result
@@ -9403,7 +9381,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i
 define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspace(1) inreg %out, i64 inreg %in) {
 ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[34:35]
@@ -9428,7 +9406,6 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
 ;
 ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s34, s4, 32
 ; VI-NEXT:    s_addc_u32 s35, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s34
@@ -9439,11 +9416,11 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -9451,6 +9428,7 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa
 ; GFX9-NEXT:    global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
   %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index e1d253db9fce1c0..c9034d74793f58e 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -18,33 +18,31 @@
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
 ; FORWARDXNACK:       ; %bb.0:
-; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FORWARDXNACK-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; FORWARDXNACK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(1)
 ; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v6
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v4
+; FORWARDXNACK-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FORWARDXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
 ; REVERSEXNACK:       ; %bb.0:
-; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; REVERSEXNACK-NEXT:    v_mov_b32_e32 v6, v3
 ; REVERSEXNACK-NEXT:    v_mov_b32_e32 v4, v1
 ; REVERSEXNACK-NEXT:    v_mov_b32_e32 v3, v0
 ; REVERSEXNACK-NEXT:    v_mov_b32_e32 v5, v2
 ; REVERSEXNACK-NEXT:    global_load_dword v0, v[3:4], off offset:4
 ; REVERSEXNACK-NEXT:    global_load_dwordx2 v[1:2], v[5:6], off
-; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0)
+; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; REVERSEXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOXNACK-LABEL: shuffle_v4f16_234u:
 ; NOXNACK:       ; %bb.0:
-; NOXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOXNACK-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; NOXNACK-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
-; NOXNACK-NEXT:    s_waitcnt vmcnt(0)
+; NOXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOXNACK-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index ffbf00765adbe22..52cd6bc506eea54 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -4,7 +4,7 @@
 define void @main(i1 %arg) #0 {
 ; CHECK-LABEL: main:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index dcc615232e56bed..d4aca22f0384b0d 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -1836,29 +1836,27 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x)
 define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_0.5_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_0.5_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_0.5_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0x3800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x38,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_0.5_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1866,7 +1864,7 @@ define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0x3800, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 0.5 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -1876,29 +1874,27 @@ define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_0.5_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_neg_0.5_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_neg_0.5_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0xb800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xb8,0xff,0xff]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_neg_0.5_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1906,7 +1902,7 @@ define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0xb800, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -0.5 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -1916,29 +1912,27 @@ define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_1.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_1.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_1.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0x3c00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x3c,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_1.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1946,7 +1940,7 @@ define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0x3c00, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 1.0 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -1956,29 +1950,27 @@ define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_1.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_neg_1.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_neg_1.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0xbc00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xbc,0xff,0xff]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_neg_1.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -1986,7 +1978,7 @@ define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0xbc00, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -1.0 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -1996,37 +1988,35 @@ define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @shl_inline_imm_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: shl_inline_imm_2.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: shl_inline_imm_2.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: shl_inline_imm_2.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_movk_i32 s4, 0x4000 ; encoding: [0x00,0x40,0x04,0xb0]
 ; VI-NEXT:    v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: shl_inline_imm_2.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_lshl_b32_e32 v2, 0x4000, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i16 bitcast (half 2.0 to i16), %x
   store i16 %y, ptr addrspace(1) %out
@@ -2036,37 +2026,35 @@ define void @shl_inline_imm_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @shl_inline_imm_neg_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: shl_inline_imm_neg_2.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: shl_inline_imm_neg_2.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: shl_inline_imm_neg_2.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_movk_i32 s4, 0xc000 ; encoding: [0x00,0xc0,0x04,0xb0]
 ; VI-NEXT:    v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: shl_inline_imm_neg_2.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_lshl_b32_e32 v2, 0xffffc000, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i16 bitcast (half -2.0 to i16), %x
   store i16 %y, ptr addrspace(1) %out
@@ -2076,29 +2064,27 @@ define void @shl_inline_imm_neg_2.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_4.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_4.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_4.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0x4400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x44,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_4.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2106,7 +2092,7 @@ define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0x4400, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 4.0 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -2116,29 +2102,27 @@ define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_neg_4.0_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_neg_4.0_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_neg_4.0_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0xc400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xc4,0xff,0xff]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_neg_4.0_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2146,7 +2130,7 @@ define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0xc400, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half -4.0 to i16)
   store i16 %y, ptr addrspace(1) %out
@@ -2156,29 +2140,27 @@ define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
 define void @mul_inline_imm_inv2pi_i16(ptr addrspace(1) %out, i16 %x) {
 ; GFX10-LABEL: mul_inline_imm_inv2pi_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
 ;
 ; GFX11-LABEL: mul_inline_imm_inv2pi_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 ;
 ; VI-LABEL: mul_inline_imm_inv2pi_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    v_mul_lo_u16_e32 v2, 0x3118, v2 ; encoding: [0xff,0x04,0x04,0x52,0x18,0x31,0x00,0x00]
 ; VI-NEXT:    flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00]
-; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; VI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: mul_inline_imm_inv2pi_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2186,7 +2168,7 @@ define void @mul_inline_imm_inv2pi_i16(ptr addrspace(1) %out, i16 %x) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_mul_u32_u24_e32 v2, 0x3118, v2
 ; SI-NEXT:    buffer_store_short v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
   %y = mul i16 %x, bitcast (half 0xH3118 to i16)
   store i16 %y, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 3aaf04c94cda586..1382cd8c23b9e22 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -120,9 +120,9 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -204,9 +204,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s16, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
@@ -292,9 +292,9 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) {
 define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -379,9 +379,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s16, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
@@ -468,9 +468,9 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) {
 define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_ret:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -554,9 +554,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_ret:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s16, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
@@ -645,9 +645,9 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
 define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GCN:       ; %bb.0: ; %bb0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -735,14 +735,14 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_addk_i32 s32, 0xfc00
 ; GCN-NEXT:    s_mov_b32 s33, s4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_in_branch:
 ; GISEL:       ; %bb.0: ; %bb0
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s16, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[18:19]
@@ -830,7 +830,7 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) {
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
 ; GISEL-NEXT:    s_addk_i32 s32, 0xfc00
 ; GISEL-NEXT:    s_mov_b32 s33, s4
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb0:
   br i1 %cond, label %bb1, label %bb2
@@ -846,9 +846,9 @@ bb2:
 define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s5, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
@@ -940,9 +940,9 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s5, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[6:7]
@@ -1038,9 +1038,9 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1136,9 +1136,9 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s10, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1242,9 +1242,9 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) {
 define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1338,9 +1338,9 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s10, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1439,9 +1439,9 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
 define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s10, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1532,9 +1532,9 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) {
 ;
 ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_mov_b32 s10, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GISEL-NEXT:    s_mov_b64 exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 220ea962b9e1dca..3b4cf470c0a0735 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -6,9 +6,9 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define void @f0() {
 ; GFX11-LABEL: f0:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s2, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -39,11 +39,11 @@ bb:
 define <2 x i64> @f1() #0 {
 ; GFX11-LABEL: f1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <2 x i64> zeroinitializer
 }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 3bc503e3714fe5b..6c9dca13c1508d8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -2004,7 +2004,6 @@ entry:
 define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
 ; GCN-LABEL: double8_inselt_vec:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
 ; GCN-NEXT:    v_mov_b32_e32 v17, 0x3ff00000
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -2030,6 +2029,7 @@ define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
 ; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 68427e8937bb946..00ea97f2902a632 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -381,12 +381,11 @@ define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x flo
 define <4 x float> @insertelement_to_sgpr() nounwind {
 ; GCN-LABEL: insertelement_to_sgpr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, 0
 ; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <4 x i32>, ptr addrspace(4) undef
   %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
@@ -397,7 +396,6 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
 define <9 x float> @insertelement_to_v9f32_undef() nounwind {
 ; GCN-LABEL: insertelement_to_v9f32_undef:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x40a00000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0xc0a00000
@@ -411,6 +409,7 @@ define <9 x float> @insertelement_to_v9f32_undef() nounwind {
 ; GCN-NEXT:    v_mov_b32_e32 v6, s10
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <9 x float>, ptr addrspace(4) undef
   %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
@@ -422,7 +421,6 @@ define <9 x float> @insertelement_to_v9f32_undef() nounwind {
 define <10 x float> @insertelement_to_v10f32_undef() nounwind {
 ; GCN-LABEL: insertelement_to_v10f32_undef:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -437,6 +435,7 @@ define <10 x float> @insertelement_to_v10f32_undef() nounwind {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, s12
 ; GCN-NEXT:    v_mov_b32_e32 v9, s13
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <10 x float>, ptr addrspace(4) undef
   %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
@@ -446,7 +445,6 @@ define <10 x float> @insertelement_to_v10f32_undef() nounwind {
 define <11 x float> @insertelement_to_v11f32_undef() nounwind {
 ; GCN-LABEL: insertelement_to_v11f32_undef:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -462,6 +460,7 @@ define <11 x float> @insertelement_to_v11f32_undef() nounwind {
 ; GCN-NEXT:    v_mov_b32_e32 v8, s12
 ; GCN-NEXT:    v_mov_b32_e32 v9, s13
 ; GCN-NEXT:    v_mov_b32_e32 v10, s14
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <11 x float>, ptr addrspace(4) undef
   %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
@@ -471,7 +470,6 @@ define <11 x float> @insertelement_to_v11f32_undef() nounwind {
 define <12 x float> @insertelement_to_v12f32_undef() nounwind {
 ; GCN-LABEL: insertelement_to_v12f32_undef:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 4.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -488,6 +486,7 @@ define <12 x float> @insertelement_to_v12f32_undef() nounwind {
 ; GCN-NEXT:    v_mov_b32_e32 v9, s13
 ; GCN-NEXT:    v_mov_b32_e32 v10, s14
 ; GCN-NEXT:    v_mov_b32_e32 v11, s15
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <12 x float>, ptr addrspace(4) undef
   %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 61017e809c86365..bb19d4260529f30 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -22,7 +22,6 @@
 define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i32:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -31,11 +30,11 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i32:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -44,11 +43,11 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i32:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -57,11 +56,11 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i32:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -70,22 +69,22 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_i32:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v0, v2, v[0:1]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[1:2]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_i32:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -94,22 +93,22 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i32:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[0:1]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i32:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -118,22 +117,22 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, v2, v0
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v0, v2, v[0:1]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i32:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, v2, v0
@@ -142,6 +141,7 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add i32 %x, 1
@@ -158,7 +158,6 @@ entry:
 define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -176,11 +175,11 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -198,21 +197,21 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -222,21 +221,21 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX8-GISEL-NEXT:    v_mad_u16 v1, v3, v2, 1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -246,21 +245,21 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v1, v3, v2, 1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
@@ -270,6 +269,7 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %conv33 = add i16 %x, 1
@@ -286,7 +286,6 @@ entry:
 define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
@@ -315,11 +314,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
@@ -366,11 +365,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -387,11 +386,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v3, v1, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -409,11 +408,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -422,11 +421,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -435,11 +434,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -448,11 +447,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -461,6 +460,7 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -477,7 +477,6 @@ entry:
 define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff, v0
@@ -528,11 +527,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v5, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v1, v4
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v3, v2
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v3i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v0
@@ -578,11 +577,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v4, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -605,11 +604,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_mad_u16 v2, v4, v2, v4
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v5, v1, v5
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v3i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v6, 1
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, 1, v0
@@ -634,11 +633,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -655,11 +654,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v3i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -676,11 +675,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
@@ -697,11 +696,11 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v3i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -718,6 +717,7 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -734,7 +734,6 @@ entry:
 define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v11, 0xffff, v3
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v7, 0xffff, v7
@@ -806,11 +805,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v4, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
@@ -903,11 +902,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v5
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v6, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v5, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -939,11 +938,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v2, v6, v3, v6
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -977,11 +976,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v6, v3
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -998,11 +997,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1019,11 +1018,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
@@ -1040,11 +1039,11 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1061,6 +1060,7 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -1077,7 +1077,6 @@ entry:
 define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v0, v1
@@ -1094,11 +1093,11 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
@@ -1115,20 +1114,20 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -1137,20 +1136,20 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_mad_u16 v1, v3, v2, 1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -1159,21 +1158,21 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v1, v3, v2, 1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
@@ -1183,6 +1182,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %conv33 = add i16 %x, 1
@@ -1199,7 +1199,6 @@ entry:
 define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
@@ -1228,11 +1227,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
@@ -1279,11 +1278,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1300,11 +1299,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v3, v1, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1322,11 +1321,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -1335,11 +1334,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -1348,11 +1347,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -1361,11 +1360,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -1374,6 +1373,7 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -1390,7 +1390,6 @@ entry:
 define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_v3i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v6, 0xffff, v0
@@ -1441,11 +1440,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v5, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v1, v4
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v3, v2
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v3i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v0
@@ -1491,11 +1490,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_v3i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v4, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1518,11 +1517,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_mad_u16 v2, v4, v2, v4
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v5, v1, v5
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_v3i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v6, 1
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v5, 1, v0
@@ -1547,11 +1546,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_v3i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1568,11 +1567,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_v3i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1589,11 +1588,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_v3i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
@@ -1610,11 +1609,11 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_v3i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1631,6 +1630,7 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -1647,7 +1647,6 @@ entry:
 define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_v4i16:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v11, 0xffff, v3
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v7, 0xffff, v7
@@ -1719,11 +1718,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v1, v5
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v4, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v3, v3, v6
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
@@ -1816,11 +1815,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v5
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_v4i16:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v6, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v5, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1852,11 +1851,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v2, v6, v3, v6
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v4, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1890,11 +1889,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v6, v3
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_v4i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1911,11 +1910,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1932,11 +1931,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_v4i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
@@ -1953,11 +1952,11 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
@@ -1974,6 +1973,7 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <4 x i16> %x, <i16 1, i16 1, i16 1, i16 1>
@@ -1990,7 +1990,6 @@ entry:
 define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i32:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2007,11 +2006,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v1, v3, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i32:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2028,11 +2027,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i32:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2049,11 +2048,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v1, v3, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i32:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2070,11 +2069,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_v2i32:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
@@ -2087,11 +2086,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v5, v[2:3]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v0, v[3:4]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v4, v2, v[4:5]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i32:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2108,11 +2107,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_v2i32:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -2126,11 +2125,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v0, v[6:7]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5]
 ; GFX90A-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i32:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2147,11 +2146,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2164,11 +2163,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[4:5], null, v2, v5, v[2:3]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v0, v[3:4]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v4, v2, v[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i32:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -2185,6 +2184,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i32> %x, <i32 1, i32 1>
@@ -2201,7 +2201,6 @@ entry:
 define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v3i32:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -2226,11 +2225,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v3i32:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
@@ -2255,11 +2254,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v7
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v2, v3, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v3i32:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
@@ -2284,11 +2283,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v3i32:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
@@ -2313,11 +2312,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v7
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v2, v3, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_v3i32:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v2, 1, v2
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v6, v2, v5
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -2336,11 +2335,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v0, v[6:7]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v2, v[7:8]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v5, v[3:4]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_v3i32:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v2, 1, v2
@@ -2365,11 +2364,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v7
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v2, v3, v2
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_v3i32:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v2, 1, v2
@@ -2389,11 +2388,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v8, v2, v[8:9]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v4, v[6:7]
 ; GFX90A-SDAG-NEXT:    v_mov_b32_e32 v1, v8
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v3i32:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v2, 1, v2
@@ -2418,11 +2417,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v6
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v7
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v8
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v2, 1, v2
@@ -2441,11 +2440,11 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v0, v[4:5]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v2, v[5:6]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[2:3], null, v6, v3, v[6:7]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v3i32:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 1, v2
@@ -2470,6 +2469,7 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v3, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v4, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v5, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y48 = add <3 x i32> %x, <i32 1, i32 1, i32 1>
@@ -2486,7 +2486,6 @@ entry:
 define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v4i32:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
@@ -2519,11 +2518,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i32:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
@@ -2556,11 +2555,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v10
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v11
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v4i32:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v3, vcc, 1, v3
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
@@ -2593,11 +2592,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v7
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v4i32:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v2, vcc, 1, v2
@@ -2630,11 +2629,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v10
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v11
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_v4i32:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v3, 1, v3
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v2, 1, v2
@@ -2659,11 +2658,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v8, v2, v[8:9]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[6:7]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v5, v4, v[5:6]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v4i32:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX9-GISEL-NEXT:    v_add_u32_e32 v2, 1, v2
@@ -2696,11 +2695,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX9-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v9
 ; GFX9-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v10
 ; GFX9-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v11
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_v4i32:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v3, 1, v3
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v2, 1, v2
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
@@ -2727,11 +2726,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v8, v6, v[8:9]
 ; GFX90A-SDAG-NEXT:    v_mov_b32_e32 v1, v12
 ; GFX90A-SDAG-NEXT:    v_mov_b32_e32 v3, v4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v2, 1, v2
@@ -2756,11 +2755,11 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v2, v[6:7]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[2:3], null, v7, v3, v[7:8]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[3:4], null, v8, v4, v[8:9]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v4i32:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, 1, v2
@@ -2793,6 +2792,7 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v4, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v5, v2
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v3, v6, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
@@ -2809,7 +2809,6 @@ entry:
 define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i24:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX67-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -2820,11 +2819,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i24:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX67-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -2835,11 +2834,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i24:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
@@ -2850,11 +2849,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i24:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
@@ -2865,11 +2864,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_i24:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX900-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -2878,11 +2877,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v0, v2, v[0:1]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[1:2]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_i24:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX900-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -2893,11 +2892,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i24:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX90A-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -2906,11 +2905,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[0:1]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i24:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX90A-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -2921,11 +2920,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i24:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -2934,11 +2933,11 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v0, v2, v[0:1]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i24:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 24
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -2949,6 +2948,7 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %shl = shl i32 %x, 8
@@ -2969,7 +2969,6 @@ entry:
 define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_u24:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -2980,11 +2979,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_u24:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
@@ -2995,11 +2994,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_u24:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX8-SDAG-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
@@ -3010,11 +3009,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_u24:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
@@ -3025,11 +3024,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_u24:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX900-SDAG-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -3038,11 +3037,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v0, v2, v[0:1]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[1:2]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_u24:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -3053,11 +3052,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_u24:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX90A-SDAG-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -3066,11 +3065,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[0:1]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_u24:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX90A-GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
@@ -3081,11 +3080,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_u24:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-SDAG-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -3094,11 +3093,11 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v0, v2, v[0:1]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_u24:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -3109,6 +3108,7 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %shl = and i32 %x, 16777215
@@ -3127,7 +3127,6 @@ entry:
 define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i8:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v1
@@ -3145,11 +3144,11 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i8:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v1
@@ -3167,21 +3166,21 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i8:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -3191,21 +3190,21 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX8-GISEL-NEXT:    v_mad_u16 v1, v3, v2, 1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_i8:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -3215,21 +3214,21 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v1, v3, v2, 1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i8:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
@@ -3239,6 +3238,7 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %conv33 = add i8 %x, 1
@@ -3255,7 +3255,6 @@ entry:
 define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i8:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
@@ -3292,11 +3291,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i8:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xff, v0
@@ -3327,11 +3326,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i8:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v1, 1, v1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u16_e32 v4, v1, v3
@@ -3347,11 +3346,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v2, v0, v2
 ; GFX8-SDAG-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i8:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v1, 1, v1
 ; GFX8-GISEL-NEXT:    v_mad_u16 v4, v0, v2, v0
@@ -3366,11 +3365,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v7, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i8:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_u16_e32 v1, 1, v1
 ; GFX9-SDAG-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-SDAG-NEXT:    v_mul_lo_u16_e32 v4, v1, v3
@@ -3386,11 +3385,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v2, v0, v2
 ; GFX9-SDAG-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i8:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v1, 1, v1
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v4, v0, v2, v0
@@ -3405,11 +3404,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v7, v1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i8:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-SDAG-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v4, v1, v3, v1
@@ -3425,11 +3424,11 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX10-SDAG-NEXT:    v_lshlrev_b16 v2, 8, v1
 ; GFX10-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-SDAG-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i8:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v4, v0, v2
@@ -3446,6 +3445,7 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v1, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i8> %x, <i8 1, i8 1>
@@ -3462,7 +3462,6 @@ entry:
 define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX6-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX6-SDAG:       ; %bb.0: ; %entry
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GFX6-SDAG-NEXT:    v_mul_hi_u32 v5, v0, v2
@@ -3495,11 +3494,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX6-GISEL:       ; %bb.0: ; %entry
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-GISEL-NEXT:    v_mul_lo_u32 v4, v1, v2
@@ -3532,11 +3531,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX6-GISEL-NEXT:    v_mul_hi_u32 v3, v6, v3
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX7-SDAG:       ; %bb.0: ; %entry
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX7-SDAG-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX7-SDAG-NEXT:    v_mul_lo_u32 v6, v4, v3
@@ -3561,11 +3560,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX7-SDAG-NEXT:    v_mul_lo_u32 v3, v5, v4
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX7-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
@@ -3588,11 +3587,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v6, v4, v3
@@ -3617,11 +3616,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v3, v5, v4
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
@@ -3644,11 +3643,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v0
 ; GFX900-SDAG-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v6, v5, v2
@@ -3669,11 +3668,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v5, v4, v1
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v1, v[3:4]
 ; GFX900-SDAG-NEXT:    v_add3_u32 v1, v5, v1, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v0
 ; GFX900-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
@@ -3696,11 +3695,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
 ; GFX90A-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v6, v1, v2
@@ -3721,11 +3720,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v5, v3, v0
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3]
 ; GFX90A-SDAG-NEXT:    v_add3_u32 v1, v5, v1, v4
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v0
 ; GFX90A-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX90A-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0
@@ -3750,11 +3749,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX90A-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0
 ; GFX90A-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3]
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i64:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 1
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v7, v4, v3
@@ -3775,11 +3774,11 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v4, v1, v3
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v3, v[0:1]
 ; GFX10-SDAG-NEXT:    v_add3_u32 v1, v4, v1, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i64:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v4, vcc_lo, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
@@ -3804,6 +3803,7 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v2, v5
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v3, v3, v4
 ; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add i64 %x, 1
@@ -3820,7 +3820,6 @@ entry:
 define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX6-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX6-SDAG:       ; %bb.0: ; %entry
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-SDAG-NEXT:    v_mul_lo_u32 v8, v0, v5
@@ -3885,11 +3884,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GFX6-SDAG-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX6-GISEL:       ; %bb.0: ; %entry
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX6-GISEL-NEXT:    v_mul_lo_u32 v8, v1, v4
@@ -3954,11 +3953,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX6-GISEL-NEXT:    v_mul_hi_u32 v3, v3, v9
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX7-SDAG:       ; %bb.0: ; %entry
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v8, vcc, 1, v0
 ; GFX7-SDAG-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GFX7-SDAG-NEXT:    v_mul_lo_u32 v10, v8, v5
@@ -4007,11 +4006,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX7-GISEL:       ; %bb.0: ; %entry
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v0
 ; GFX7-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v14, vcc, 1, v2
@@ -4059,11 +4058,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
 ; GFX7-GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
 ; GFX7-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v8, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v10, v8, v5
@@ -4112,11 +4111,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v12, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v14, vcc, 1, v2
@@ -4164,11 +4163,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
 ; GFX8-GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v2
 ; GFX900-SDAG-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
 ; GFX900-SDAG-NEXT:    v_add_co_u32_e32 v10, vcc, 1, v0
@@ -4209,11 +4208,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v7, v5, v[7:8]
 ; GFX900-SDAG-NEXT:    v_add3_u32 v1, v9, v1, v4
 ; GFX900-SDAG-NEXT:    v_add3_u32 v3, v10, v3, v6
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 1, v0
 ; GFX900-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_add_co_u32_e32 v14, vcc, 1, v2
@@ -4261,11 +4260,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
 ; GFX900-GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
 ; GFX90A-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
@@ -4306,11 +4305,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5]
 ; GFX90A-SDAG-NEXT:    v_add3_u32 v1, v9, v1, v8
 ; GFX90A-SDAG-NEXT:    v_add3_u32 v3, v7, v3, v6
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_co_u32_e32 v10, vcc, 1, v0
 ; GFX90A-GISEL-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
 ; GFX90A-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 1, v2
@@ -4359,11 +4358,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX90A-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0
 ; GFX90A-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5]
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v10, vcc_lo, v2, 1
@@ -4404,11 +4403,11 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[2:3], null, v7, v5, v[7:8]
 ; GFX10-SDAG-NEXT:    v_add3_u32 v1, v9, v1, v4
 ; GFX10-SDAG-NEXT:    v_add3_u32 v3, v10, v3, v6
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i64:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v8, vcc_lo, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v10, vcc_lo, v2, 1
@@ -4457,6 +4456,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v5, v5, v12
 ; GFX10-GISEL-NEXT:    v_add3_u32 v1, v1, v4, v7
 ; GFX10-GISEL-NEXT:    v_add3_u32 v3, v3, v6, v5
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i64> %x, <i64 1, i64 1>
@@ -4473,7 +4473,6 @@ entry:
 define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(1) %ptr) {
 ; GFX6-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX6-SDAG:       ; %bb.0: ; %bb
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -4489,14 +4488,13 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-SDAG-NEXT:    buffer_store_dword v5, v[3:4], s[4:7], 0 addr64
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(1)
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v5, v1
-; GFX6-SDAG-NEXT:    s_waitcnt expcnt(0)
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX6-GISEL:       ; %bb.0: ; %bb
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
@@ -4511,14 +4509,13 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-GISEL-NEXT:    buffer_store_dword v5, v[3:4], s[4:7], 0 addr64
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(1)
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v1
-; GFX6-GISEL-NEXT:    s_waitcnt expcnt(0)
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX7-SDAG:       ; %bb.0: ; %bb
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX7-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -4536,11 +4533,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX7-SDAG-NEXT:    buffer_store_dword v5, v[3:4], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v5, v1
+; GFX7-SDAG-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX7-GISEL:       ; %bb.0: ; %bb
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
@@ -4557,11 +4554,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX7-GISEL-NEXT:    buffer_store_dword v5, v[3:4], s[4:7], 0 addr64
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v1
+; GFX7-GISEL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX8:       ; %bb.0: ; %bb
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -4575,11 +4572,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX8-NEXT:    flat_store_dword v[3:4], v5
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4593,11 +4590,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX900-NEXT:    global_store_dword v[3:4], v5, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_add_u32_e32 v0, v5, v1
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX90A-SDAG:       ; %bb.0: ; %bb
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4613,11 +4610,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX90A-SDAG-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX90A-SDAG-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX90A-GISEL:       ; %bb.0: ; %bb
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v3, v4
@@ -4633,11 +4630,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX90A-GISEL-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX90A-GISEL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_multi_use_mul_chain_add_other_use_all:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
@@ -4651,6 +4648,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar
 ; GFX10-NEXT:    global_store_dword v[3:4], v5, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = add i32 %arg, 1
@@ -4669,7 +4667,6 @@ bb:
 define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(1) %ptr) {
 ; GFX6-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX6-SDAG:       ; %bb.0: ; %bb
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -4685,11 +4682,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX6-GISEL:       ; %bb.0: ; %bb
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-GISEL-NEXT:    s_mov_b32 s6, 0
@@ -4704,11 +4701,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX6-GISEL-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX7-SDAG:       ; %bb.0: ; %bb
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX7-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX7-SDAG-NEXT:    s_mov_b32 s6, 0
@@ -4724,11 +4721,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-SDAG-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX7-GISEL:       ; %bb.0: ; %bb
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX7-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX7-GISEL-NEXT:    s_mov_b32 s6, 0
@@ -4743,11 +4740,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX7-GISEL-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-GISEL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX8:       ; %bb.0: ; %bb
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -4759,11 +4756,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX8-NEXT:    flat_store_dword v[3:4], v0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX900:       ; %bb.0: ; %bb
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4775,11 +4772,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX900-NEXT:    global_store_dword v[3:4], v0, off
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX90A-SDAG:       ; %bb.0: ; %bb
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4793,11 +4790,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX90A-SDAG-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX90A-SDAG-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX90A-GISEL:       ; %bb.0: ; %bb
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX90A-GISEL-NEXT:    v_mov_b32_e32 v3, v4
@@ -4811,11 +4808,11 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX90A-GISEL-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX90A-GISEL-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_multi_use_mul_chain_add_other_use_some:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v2, v0
@@ -4827,6 +4824,7 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a
 ; GFX10-NEXT:    global_store_dword v[3:4], v5, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v5, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = add i32 %arg, 1
@@ -4844,7 +4842,6 @@ bb:
 define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -4861,11 +4858,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -4882,11 +4879,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -4903,11 +4900,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
@@ -4924,11 +4921,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4943,11 +4940,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v0, v2, v[0:1]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[1:2]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4964,11 +4961,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -4983,11 +4980,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v2, v[0:1]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[2:3]
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -5004,11 +5001,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, v2, v0
@@ -5023,11 +5020,11 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v0, v2, v[0:1]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, v[1:2]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i32_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, v2, v0
@@ -5044,6 +5041,7 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y38 = add i32 %x, 1
@@ -5068,7 +5066,6 @@ entry:
 define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -5101,11 +5098,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-SDAG-NEXT:    v_mul_lo_u32 v1, v3, v1
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5138,11 +5135,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -5175,11 +5172,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX8-SDAG-NEXT:    v_mul_lo_u32 v1, v3, v1
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-SDAG-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5212,11 +5209,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX8-GISEL-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX900-SDAG:       ; %bb.0: ; %entry
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX900-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
@@ -5245,11 +5242,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v2, v5, v[2:3]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v3, v0, v[3:4]
 ; GFX900-SDAG-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v4, v2, v[4:5]
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX900-GISEL:       ; %bb.0: ; %entry
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5282,11 +5279,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX900-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-SDAG-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX90A-SDAG:       ; %bb.0: ; %entry
-; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-SDAG-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5316,11 +5313,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v0, v[6:7]
 ; GFX90A-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v2, v[4:5]
 ; GFX90A-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX90A-GISEL:       ; %bb.0: ; %entry
-; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v0, 1, v0
 ; GFX90A-GISEL-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5353,11 +5350,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v0, v0, v4
 ; GFX90A-GISEL-NEXT:    v_mul_lo_u32 v1, v1, v5
+; GFX90A-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-SDAG-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-SDAG-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5386,11 +5383,11 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[4:5], null, v2, v5, v[2:3]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v0, v[3:4]
 ; GFX10-SDAG-NEXT:    v_mad_u64_u32 v[1:2], null, v4, v2, v[4:5]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i32_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -5423,6 +5420,7 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX10-GISEL-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y38 = add <2 x i32> %x, <i32 1, i32 1>
@@ -5447,7 +5445,6 @@ entry:
 define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -5481,11 +5478,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -5519,11 +5516,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
@@ -5533,11 +5530,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -5555,11 +5552,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX8-GISEL-NEXT:    v_mad_u16 v1, v1, v4, 1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
@@ -5569,11 +5566,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -5591,11 +5588,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v1, v1, v4, 1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
@@ -5605,11 +5602,11 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_i16_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
@@ -5627,6 +5624,7 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %conv69 = add i16 %x, 1
@@ -5651,7 +5649,6 @@ entry:
 define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v2, v0, v1
@@ -5684,11 +5681,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
@@ -5721,11 +5718,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
@@ -5734,11 +5731,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -5755,11 +5752,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX8-GISEL-NEXT:    v_mad_u16 v1, v1, v4, 1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
@@ -5768,11 +5765,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v1, v0, v1, v0
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v1, v0, v1
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_add_u16_e32 v3, 1, v1
@@ -5789,11 +5786,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v1, v1, v4, 1
 ; GFX9-GISEL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v1, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
@@ -5803,11 +5800,11 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v1, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_i16_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, 1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
@@ -5825,6 +5822,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v1, 1
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %conv69 = add i16 %x, 1
@@ -5849,7 +5847,6 @@ entry:
 define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
@@ -5902,11 +5899,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
@@ -5997,11 +5994,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6030,11 +6027,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v3, v1, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6064,11 +6061,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6085,11 +6082,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6106,11 +6103,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6127,11 +6124,11 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6148,6 +6145,7 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -6172,7 +6170,6 @@ entry:
 define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
@@ -6225,11 +6222,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
@@ -6320,11 +6317,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-SDAG-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-SDAG-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6353,11 +6350,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-SDAG-NEXT:    v_mad_u16 v1, v3, v1, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-GISEL-NEXT:    v_add_u16_e32 v2, 1, v0
 ; GFX8-GISEL-NEXT:    v_add_u16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6387,11 +6384,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6408,11 +6405,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6429,11 +6426,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6450,11 +6447,11 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-SDAG-NEXT:    v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v2, v0
@@ -6471,6 +6468,7 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y38 = add <2 x i16> %x, <i16 1, i16 1>
@@ -6496,34 +6494,34 @@ entry:
 define <2 x i32> @multi_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z0, i32 %z1) {
 ; GFX67-LABEL: multi_use_mul_mad_i32_var:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: multi_use_mul_mad_i32_var:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: multi_use_mul_mad_i32_var:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: multi_use_mul_mad_i32_var:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul i32 %x, %y
@@ -6537,74 +6535,74 @@ entry:
 define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) {
 ; GFX67-SDAG-LABEL: multi_use_mul_mad_i16_var:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mad_u32_u24 v0, v4, v1, v2
 ; GFX67-SDAG-NEXT:    v_mad_u32_u24 v1, v4, v1, v3
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: multi_use_mul_mad_i16_var:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: multi_use_mul_mad_i16_var:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mad_u16 v3, v0, v1, v3
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX8-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: multi_use_mul_mad_i16_var:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mad_u16 v2, v0, v1, v2
 ; GFX8-GISEL-NEXT:    v_mad_u16 v0, v0, v1, v3
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: multi_use_mul_mad_i16_var:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v2, v0, v1, v2
 ; GFX9-SDAG-NEXT:    v_mad_legacy_u16 v0, v0, v1, v3
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: multi_use_mul_mad_i16_var:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v2, v0, v1, v2
 ; GFX9-GISEL-NEXT:    v_mad_legacy_u16 v0, v0, v1, v3
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: multi_use_mul_mad_i16_var:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mad_u16 v2, v0, v1, v2
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v3
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: multi_use_mul_mad_i16_var:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul i16 %x, %y
@@ -6618,40 +6616,36 @@ entry:
 define i32 @other_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z, ptr addrspace(3) %ptr) {
 ; GFX67-LABEL: other_use_mul_mad_i32_var:
 ; GFX67:       ; %bb.0: ; %entry
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    s_mov_b32 m0, -1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
 ; GFX67-NEXT:    ds_write_b32 v3, v1
-; GFX67-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: other_use_mul_mad_i32_var:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v2
 ; GFX8-NEXT:    ds_write_b32 v3, v1
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: other_use_mul_mad_i32_var:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v2
 ; GFX9-NEXT:    ds_write_b32 v3, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: other_use_mul_mad_i32_var:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v1, v2
 ; GFX10-NEXT:    ds_write_b32 v3, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul i32 %x, %y
@@ -6663,63 +6657,57 @@ entry:
 define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) %ptr) {
 ; GFX67-SDAG-LABEL: other_use_mul_mad_i16_var:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    v_mul_u32_u24_e32 v4, v0, v1
 ; GFX67-SDAG-NEXT:    v_mad_u32_u24 v0, v0, v1, v2
 ; GFX67-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX67-SDAG-NEXT:    ds_write_b16 v3, v4
-; GFX67-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: other_use_mul_mad_i16_var:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v0, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
 ; GFX67-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX67-GISEL-NEXT:    ds_write_b16 v3, v1
-; GFX67-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: other_use_mul_mad_i16_var:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v4, v0, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write_b16 v3, v4
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: other_use_mul_mad_i16_var:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v4, v0, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v2
 ; GFX9-NEXT:    ds_write_b16 v3, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: other_use_mul_mad_i16_var:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mul_lo_u16 v4, v0, v1
 ; GFX10-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX10-SDAG-NEXT:    ds_write_b16 v3, v4
-; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: other_use_mul_mad_i16_var:
 ; GFX10-GISEL:       ; %bb.0: ; %entry
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v1, v0, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v1, v2
 ; GFX10-GISEL-NEXT:    ds_write_b16 v3, v1
-; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul i16 %x, %y
@@ -6731,7 +6719,6 @@ entry:
 define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i16> %z0, <2 x i16> %z1) {
 ; GFX67-SDAG-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -6748,11 +6735,11 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-SDAG-NEXT:    v_or_b32_e32 v2, v2, v1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v5
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
@@ -6763,11 +6750,11 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v3, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -6781,11 +6768,11 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX8-SDAG-NEXT:    v_mad_u16 v0, v0, v1, v3
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v0, v4
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -6799,22 +6786,23 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX8-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: multi_use_mul_mad_v2i16_var:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul <2 x i16> %x, %y
@@ -6827,7 +6815,6 @@ entry:
 define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i16> %z, ptr addrspace(3) %ptr) {
 ; GFX67-SDAG-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX67-SDAG:       ; %bb.0: ; %entry
-; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -6845,12 +6832,11 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX67-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-SDAG-NEXT:    ds_write_b32 v6, v7
-; GFX67-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX67-GISEL-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
-; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
@@ -6865,12 +6851,11 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX67-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX67-GISEL-NEXT:    ds_write_b32 v6, v2
-; GFX67-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-SDAG-NEXT:    v_mul_lo_u16_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -6883,12 +6868,11 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX8-SDAG-NEXT:    ds_write_b32 v3, v6
-; GFX8-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX8-GISEL-NEXT:    v_mul_lo_u16_e32 v6, v0, v1
@@ -6901,25 +6885,23 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX8-GISEL-NEXT:    ds_write_b32 v3, v6
-; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX9-NEXT:    v_pk_add_u16 v0, v1, v2
 ; GFX9-NEXT:    ds_write_b32 v3, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: other_use_mul_mad_v2i16_var:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX10-NEXT:    v_pk_add_u16 v0, v1, v2
 ; GFX10-NEXT:    ds_write_b32 v3, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %mul = mul <2 x i16> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-nan.ll b/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
index 34aecd77c2506f7..513a084bc007c49 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-nan.ll
@@ -4,13 +4,13 @@
 define half @known_nnan_extract_vector_elt(float %a, float %b, i32 %idx, half %c) {
 ; CHECK-LABEL: known_nnan_extract_vector_elt:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_pkrtz_f16_f32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
 ; CHECK-NEXT:    v_add_f16_e32 v2, 1.0, v3
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; CHECK-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %cvt = call nnan <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
   %extract = extractelement <2 x half> %cvt, i32 %idx
@@ -25,12 +25,12 @@ define half @known_nnan_extract_vector_elt(float %a, float %b, i32 %idx, half %c
 define float @fma_not_fmaxnm_maybe_nan(i32 %i1, i32 %i2, i32 %i3) #0 {
 ; CHECK-LABEL: fma_not_fmaxnm_maybe_nan:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, v1
 ; CHECK-NEXT:    v_fmaak_f32 v0, v1, v0, 0xff800000
 ; CHECK-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0xff800000, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0xff800000, v0, vcc_lo
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %f1 = uitofp i32 %i1 to float
   %f2 = uitofp i32 %i2 to float
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
index 64948c374e4ddc6..67fec4d2e1928ff 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -7,9 +7,9 @@
 define float @v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, |v0|, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.fabs.f32(float %a.nnan.add)
@@ -21,9 +21,9 @@ define float @v_test_known_not_snan_fabs_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, -v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = fneg float %a.nnan.add
@@ -35,10 +35,10 @@ define float @v_test_known_not_snan_fneg_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_fpext_input_fmed3_r_i_i_f32(half %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fpext_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fadd nnan half %a, 1.0
   %known.not.snan = fpext half %a.nnan.add to float
@@ -50,10 +50,10 @@ define float @v_test_known_not_snan_fpext_input_fmed3_r_i_i_f32(half %a) #0 {
 define float @v_test_known_not_snan_fptrunc_input_fmed3_r_i_i_f32(double %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fptrunc_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fadd nnan double %a, 1.0
   %known.not.snan = fptrunc double %a.nnan.add to float
@@ -65,11 +65,11 @@ define float @v_test_known_not_snan_fptrunc_input_fmed3_r_i_i_f32(double %a) #0
 define float @v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32(float %a, float %sign) #0 {
 ; GCN-LABEL: v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_brev_b32 s4, -2
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.copysign.f32(float %a.nnan.add, float %sign)
@@ -82,9 +82,9 @@ define float @v_test_known_not_snan_copysign_input_fmed3_r_i_i_f32(float %a, flo
 define float @v_test_known_canonicalize_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_canonicalize_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.canonicalize.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -95,11 +95,11 @@ define float @v_test_known_canonicalize_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -112,11 +112,11 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
 define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nsnan = fadd float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
@@ -128,11 +128,11 @@ define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float
 define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nsnan = fadd float %a, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
@@ -144,11 +144,11 @@ define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float
 define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
@@ -160,11 +160,11 @@ define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
@@ -176,11 +176,11 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -193,11 +193,11 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float
 define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
@@ -209,11 +209,11 @@ define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
@@ -225,12 +225,12 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_select_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -244,12 +244,12 @@ define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float
 define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 {
 ; GCN-LABEL: v_select_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %cmp = icmp eq i32 %c, 0
@@ -262,12 +262,12 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b, i32 %c) #0 {
 ; GCN-LABEL: v_select_possible_nan_rhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
   %cmp = icmp eq i32 %c, 0
@@ -280,9 +280,9 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_test_known_not_snan_fadd_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fadd_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = fadd float %a, %b
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -293,9 +293,9 @@ define float @v_test_known_not_snan_fadd_input_fmed3_r_i_i_f32(float %a, float %
 define float @v_test_known_not_snan_fsub_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fsub_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = fsub float %a, %b
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -306,9 +306,9 @@ define float @v_test_known_not_snan_fsub_input_fmed3_r_i_i_f32(float %a, float %
 define float @v_test_known_not_snan_fmul_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fmul_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = fmul float %a, %b
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -319,9 +319,9 @@ define float @v_test_known_not_snan_fmul_input_fmed3_r_i_i_f32(float %a, float %
 define float @v_test_known_not_snan_uint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_uint_to_fp_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = uitofp i32 %a to float
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -332,9 +332,9 @@ define float @v_test_known_not_snan_uint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0
 define float @v_test_known_not_snan_sint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_sint_to_fp_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = sitofp i32 %a to float
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -345,9 +345,9 @@ define float @v_test_known_not_snan_sint_to_fp_input_fmed3_r_i_i_f32(i32 %a) #0
 define float @v_test_known_not_snan_fma_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fma_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.fma.f32(float %a, float %b, float %c)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -358,9 +358,9 @@ define float @v_test_known_not_snan_fma_input_fmed3_r_i_i_f32(float %a, float %b
 define float @v_test_known_not_snan_fmad_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fmad_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v2, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -372,11 +372,11 @@ define float @v_test_known_not_snan_fmad_input_fmed3_r_i_i_f32(float %a, float %
 define float @v_test_known_not_snan_sin_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_sin_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0.15915494, v0
 ; GCN-NEXT:    v_fract_f32_e32 v0, v0
 ; GCN-NEXT:    v_sin_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.sin.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -387,11 +387,11 @@ define float @v_test_known_not_snan_sin_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_cos_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_cos_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0.15915494, v0
 ; GCN-NEXT:    v_fract_f32_e32 v0, v0
 ; GCN-NEXT:    v_cos_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.cos.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -402,9 +402,9 @@ define float @v_test_known_not_snan_cos_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_exp2_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_exp2_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.exp2.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -415,9 +415,9 @@ define float @v_test_known_not_snan_exp2_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_trunc_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_trunc_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trunc_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.trunc.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -428,9 +428,9 @@ define float @v_test_known_not_snan_trunc_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_floor_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_floor_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_floor_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.floor.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -441,9 +441,9 @@ define float @v_test_known_not_snan_floor_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_ceil_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_ceil_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_floor_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.floor.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -454,7 +454,6 @@ define float @v_test_known_not_snan_ceil_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_round_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v0
 ; GCN-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -463,6 +462,7 @@ define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-NEXT:    v_bfi_b32 v0, s4, v2, v0
 ; GCN-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.round.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -473,9 +473,9 @@ define float @v_test_known_not_snan_round_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_rint_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_rint_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rndne_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.rint.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -486,9 +486,9 @@ define float @v_test_known_not_snan_rint_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_nearbyint_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_nearbyint_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rndne_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.nearbyint.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -499,9 +499,9 @@ define float @v_test_known_not_snan_nearbyint_input_fmed3_r_i_i_f32(float %a) #0
 define float @v_test_known_not_snan_fmul_legacy_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fmul_legacy_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -512,9 +512,9 @@ define float @v_test_known_not_snan_fmul_legacy_input_fmed3_r_i_i_f32(float %a,
 define float @v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32(float %a, i32 %b) #0 {
 ; GCN-LABEL: v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.ldexp.f32.i32(float %a, i32 %b)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -525,10 +525,10 @@ define float @v_test_known_not_snan_ldexp_input_fmed3_r_i_i_f32(float %a, i32 %b
 define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_med3_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -539,9 +539,9 @@ define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float
 define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min0 = call float @llvm.minnum.f32(float %a, float %b)
   %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)
@@ -553,9 +553,9 @@ define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float
 define float @v_test_known_not_snan_cvt_ubyte0_input_fmed3_r_i_i_f32(i8 %char) #0 {
 ; GCN-LABEL: v_test_known_not_snan_cvt_ubyte0_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %char to float
   %max = call float @llvm.maxnum.f32(float %cvt, float 2.0)
@@ -566,9 +566,9 @@ define float @v_test_known_not_snan_cvt_ubyte0_input_fmed3_r_i_i_f32(i8 %char) #
 define float @v_test_not_known_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 {
 ; GCN-LABEL: v_test_not_known_frexp_mant_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.frexp.mant.f32(float %arg)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -579,10 +579,10 @@ define float @v_test_not_known_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 {
 define float @v_test_known_not_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 {
 ; GCN-LABEL: v_test_known_not_frexp_mant_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %arg, 1.0
   %known.not.snan = call float @llvm.amdgcn.frexp.mant.f32(float %add)
@@ -594,9 +594,9 @@ define float @v_test_known_not_frexp_mant_input_fmed3_r_i_i_f32(float %arg) #0 {
 define float @v_test_known_not_snan_rcp_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_rcp_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.rcp.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -606,9 +606,9 @@ define float @v_test_known_not_snan_rcp_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_rsq_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_rsq_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.rsq.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -619,9 +619,9 @@ define float @v_test_known_not_snan_rsq_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_fract_input_fmed3_r_i_i_f32(float %a) #0 {
 ; GCN-LABEL: v_test_known_not_snan_fract_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fract_f32_e32 v0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fract.f32(float %a)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
@@ -632,9 +632,9 @@ define float @v_test_known_not_snan_fract_input_fmed3_r_i_i_f32(float %a) #0 {
 define float @v_test_known_not_snan_cubeid_input_fmed3_r_i_i_f32(float %a, float %b, float %c) #0 {
 ; GCN-LABEL: v_test_known_not_snan_cubeid_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cubeid_f32 v0, v0, v1, v2
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index c22eb877bd56b62..ade6e23c19a7687 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -16,10 +16,9 @@
 define void @use_module() #0 {
 ; CHECK-LABEL: use_module:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   store i16 0, ptr addrspace(3) @module_variable
   ret void
@@ -38,7 +37,6 @@ define void @use_module() #0 {
 define void @use_extern_normal() #0 {
 ; CHECK-LABEL: use_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table at rel32@hi+12
@@ -52,7 +50,7 @@ define void @use_extern_normal() #0 {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    ds_write_b32 v1, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 0
   store float 0x40091EB860000000, ptr addrspace(3) %arrayidx
@@ -62,7 +60,6 @@ define void @use_extern_normal() #0 {
 define void @use_extern_overalign() #0 {
 ; CHECK-LABEL: use_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table at rel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table at rel32@hi+12
@@ -76,7 +73,7 @@ define void @use_extern_overalign() #0 {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s4
 ; CHECK-NEXT:    ds_write_b32 v1, v0 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 1
   store float 4.200000e+01, ptr addrspace(3) %arrayidx
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index 7ba4b314ebdb86c..89a1d378767ec8c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -30,64 +30,58 @@
 define void @func_use_lds_global() {
 ; GFX8-SDAG-LABEL: func_use_lds_global:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX8-SDAG-NEXT:    ds_write_b32 v0, v0
 ; GFX8-SDAG-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-SDAG-NEXT:    s_trap 2
-; GFX8-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: func_use_lds_global:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX8-GISEL-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-GISEL-NEXT:    s_trap 2
 ; GFX8-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: func_use_lds_global:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-SDAG-NEXT:    ds_write_b32 v0, v0
 ; GFX9-SDAG-NEXT:    s_trap 2
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: func_use_lds_global:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-GISEL-NEXT:    s_trap 2
 ; GFX9-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-LABEL: func_use_lds_global:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    ds_write_b32 v0, v0
 ; SDAG-NEXT:    s_cbranch_execnz .LBB0_2
 ; SDAG-NEXT:  ; %bb.1:
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ; SDAG-NEXT:  .LBB0_2:
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: func_use_lds_global:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_cbranch_execnz .LBB0_2
 ; GISEL-NEXT:  ; %bb.1:
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    ds_write_b32 v0, v0
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB0_2:
 ; GISEL-NEXT:    s_endpgm
@@ -99,50 +93,47 @@ define void @func_use_lds_global() {
 define void @func_use_lds_global_constexpr_cast() {
 ; GFX8-SDAG-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-SDAG-NEXT:    s_trap 2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-GISEL-NEXT:    s_trap 2
 ; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v0
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_trap 2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: func_use_lds_global_constexpr_cast:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_trap 2
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-LABEL: func_use_lds_global_constexpr_cast:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_cbranch_execnz .LBB1_2
 ; SDAG-NEXT:  ; %bb.1:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ; SDAG-NEXT:  .LBB1_2:
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: func_use_lds_global_constexpr_cast:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_cbranch_execnz .LBB1_2
 ; GISEL-NEXT:  ; %bb.1:
 ; GISEL-NEXT:    global_store_dword v[0:1], v0, off
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB1_2:
 ; GISEL-NEXT:    s_endpgm
@@ -154,11 +145,11 @@ define void @func_use_lds_global_constexpr_cast() {
 define void @func_uses_lds_multi(i1 %cond) {
 ; GFX8-SDAG-LABEL: func_uses_lds_multi:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX8-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX8-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; GFX8-SDAG-NEXT:    s_cbranch_execz .LBB2_2
@@ -181,15 +172,15 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX8-SDAG-NEXT:    ds_write_b32 v0, v0
 ; GFX8-SDAG-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-SDAG-NEXT:    s_trap 2
-; GFX8-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: func_uses_lds_multi:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX8-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-GISEL-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX8-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; GFX8-GISEL-NEXT:    s_cbranch_execz .LBB2_2
@@ -215,15 +206,15 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX8-GISEL-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-GISEL-NEXT:    s_trap 2
 ; GFX8-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX8-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: func_uses_lds_multi:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB2_2
@@ -243,15 +234,15 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX9-SDAG-NEXT:    ds_write_b32 v0, v0
 ; GFX9-SDAG-NEXT:    s_trap 2
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: func_uses_lds_multi:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX9-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GFX9-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB2_2
@@ -271,15 +262,15 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX9-GISEL-NEXT:    s_trap 2
 ; GFX9-GISEL-NEXT:    ds_write_b32 v0, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-LABEL: func_uses_lds_multi:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SDAG-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; SDAG-NEXT:    s_waitcnt expcnt(0)
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_2
@@ -300,17 +291,17 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; SDAG-NEXT:    ds_write_b32 v0, v0
 ; SDAG-NEXT:    s_cbranch_execnz .LBB2_6
 ; SDAG-NEXT:  ; %bb.5: ; %ret
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ; SDAG-NEXT:  .LBB2_6:
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: func_uses_lds_multi:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GISEL-NEXT:    s_cbranch_execz .LBB2_3
@@ -333,7 +324,7 @@ define void @func_uses_lds_multi(i1 %cond) {
 ; GISEL-NEXT:  ; %bb.7: ; %ret
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 2
 ; GISEL-NEXT:    ds_write_b32 v0, v0
-; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB2_8:
 ; GISEL-NEXT:    s_endpgm
@@ -357,7 +348,6 @@ ret:
 define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
 ; GFX8-SDAG-LABEL: func_uses_lds_code_after:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX8-SDAG-NEXT:    ds_write_b32 v0, v2
@@ -365,12 +355,11 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
 ; GFX8-SDAG-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; GFX8-SDAG-NEXT:    s_trap 2
 ; GFX8-SDAG-NEXT:    flat_store_dword v[0:1], v2
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: func_uses_lds_code_after:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX8-GISEL-NEXT:    s_mov_b64 s[0:1], s[6:7]
@@ -378,55 +367,51 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
 ; GFX8-GISEL-NEXT:    ds_write_b32 v0, v2
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: func_uses_lds_code_after:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-SDAG-NEXT:    ds_write_b32 v0, v2
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    s_trap 2
 ; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: func_uses_lds_code_after:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-GISEL-NEXT:    s_trap 2
 ; GFX9-GISEL-NEXT:    ds_write_b32 v0, v2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v2, off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-LABEL: func_uses_lds_code_after:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-NEXT:    ds_write_b32 v0, v2
 ; SDAG-NEXT:    s_cbranch_execnz .LBB3_2
 ; SDAG-NEXT:  ; %bb.1:
 ; SDAG-NEXT:    v_mov_b32_e32 v2, 1
 ; SDAG-NEXT:    global_store_dword v[0:1], v2, off
-; SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ; SDAG-NEXT:  .LBB3_2:
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: func_uses_lds_code_after:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_cbranch_execnz .LBB3_2
 ; GISEL-NEXT:  ; %bb.1:
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-NEXT:    ds_write_b32 v0, v2
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 1
 ; GISEL-NEXT:    global_store_dword v[0:1], v2, off
-; GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ; GISEL-NEXT:  .LBB3_2:
 ; GISEL-NEXT:    s_endpgm
@@ -439,12 +424,12 @@ define void @func_uses_lds_code_after(ptr addrspace(1) %ptr) {
 define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ; GFX8-SDAG-LABEL: func_uses_lds_phi_after:
 ; GFX8-SDAG:       ; %bb.0: ; %entry
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX8-SDAG-NEXT:    flat_load_dword v0, v[1:2] glc
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-SDAG-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-SDAG-NEXT:  ; %bb.1: ; %use.bb
@@ -462,12 +447,12 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ;
 ; GFX8-GISEL-LABEL: func_uses_lds_phi_after:
 ; GFX8-GISEL:       ; %bb.0: ; %entry
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX8-GISEL-NEXT:    flat_load_dword v0, v[1:2] glc
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX8-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX8-GISEL-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-GISEL-NEXT:  ; %bb.1: ; %use.bb
@@ -485,12 +470,12 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ;
 ; GFX9-SDAG-LABEL: func_uses_lds_phi_after:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-SDAG-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-SDAG-NEXT:  ; %bb.1: ; %use.bb
@@ -506,12 +491,12 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ;
 ; GFX9-GISEL-LABEL: func_uses_lds_phi_after:
 ; GFX9-GISEL:       ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-GISEL-NEXT:  ; %bb.1: ; %use.bb
@@ -527,12 +512,12 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ;
 ; SDAG-LABEL: func_uses_lds_phi_after:
 ; SDAG:       ; %bb.0: ; %entry
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b32_e32 v3, v0
 ; SDAG-NEXT:    global_load_dword v0, v[1:2], off glc
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    v_and_b32_e32 v3, 1, v3
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; SDAG-NEXT:    s_waitcnt expcnt(0)
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_cbranch_execz .LBB4_3
 ; SDAG-NEXT:  ; %bb.1: ; %use.bb
@@ -551,12 +536,12 @@ define i32 @func_uses_lds_phi_after(i1 %cond, ptr addrspace(1) %ptr) {
 ;
 ; GISEL-LABEL: func_uses_lds_phi_after:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b32_e32 v3, v0
 ; GISEL-NEXT:    global_load_dword v0, v[1:2], off glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    s_waitcnt expcnt(0)
 ; GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GISEL-NEXT:    s_cbranch_execz .LBB4_3
 ; GISEL-NEXT:  ; %bb.1: ; %use.bb
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
index 815e5c709ff7cb9..26bcd25dff60e1a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitreplicate.ll
@@ -7,9 +7,9 @@ declare i64 @llvm.amdgcn.s.bitreplicate(i32)
 define i64 @test_s_bitreplicate_constant() {
 ; GFX11-LABEL: test_s_bitreplicate_constant:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xfccc30c
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xc033fffc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 u0x85FE3A92)
@@ -19,8 +19,8 @@ entry:
 define i64 @test_s_bitreplicate_constant_zero() {
 ; GFX11-LABEL: test_s_bitreplicate_constant_zero:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 0)
@@ -30,8 +30,8 @@ entry:
 define i64 @test_s_bitreplicate_constant_neg_one() {
 ; GFX11-LABEL: test_s_bitreplicate_constant_neg_one:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 -1)
@@ -41,9 +41,9 @@ entry:
 define i64 @test_s_bitreplicate_constant_undef() {
 ; GFX11-LABEL: test_s_bitreplicate_constant_undef:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 undef)
@@ -53,9 +53,9 @@ entry:
 define i64 @test_s_bitreplicate_constant_poison() {
 ; GFX11-LABEL: test_s_bitreplicate_constant_poison:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 poison)
@@ -80,10 +80,10 @@ entry:
 define i64 @test_s_bitreplicate_vgpr(i32 %mask) {
 ; GFX11-LABEL: test_s_bitreplicate_vgpr:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_bitreplicate_b64_b32 s[0:1], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.bitreplicate(i32 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
index 99a092e310abbd9..d3bc8dff6ed2bcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp2.ll
@@ -5,8 +5,8 @@
 define float @v_exp2_f32(float %src)  {
 ; GCN-LABEL: v_exp2_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %exp2 = call float @llvm.amdgcn.exp2.f32(float %src)
   ret float %exp2
@@ -15,8 +15,8 @@ define float @v_exp2_f32(float %src)  {
 define float @v_fabs_exp2_f32(float %src)  {
 ; GCN-LABEL: v_fabs_exp2_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %exp2 = call float @llvm.amdgcn.exp2.f32(float %fabs.src)
@@ -26,8 +26,8 @@ define float @v_fabs_exp2_f32(float %src)  {
 define float @v_fneg_fabs_exp2_f32(float %src)  {
 ; GCN-LABEL: v_fneg_fabs_exp2_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %neg.fabs.src = fneg float %fabs.src
@@ -38,8 +38,8 @@ define float @v_fneg_fabs_exp2_f32(float %src)  {
 define half @v_exp2_f16(half %src)  {
 ; GCN-LABEL: v_exp2_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %exp2 = call half @llvm.amdgcn.exp2.f16(half %src)
   ret half %exp2
@@ -48,8 +48,8 @@ define half @v_exp2_f16(half %src)  {
 define half @v_fabs_exp2_f16(half %src)  {
 ; GCN-LABEL: v_fabs_exp2_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f16_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %exp2 = call half @llvm.amdgcn.exp2.f16(half %fabs.src)
@@ -59,8 +59,8 @@ define half @v_fabs_exp2_f16(half %src)  {
 define half @v_fneg_fabs_exp2_f16(half %src)  {
 ; GCN-LABEL: v_fneg_fabs_exp2_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %neg.fabs.src = fneg half %fabs.src
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll
index 4b2720893563333..a820f80efe5be3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fma.legacy.ll
@@ -7,14 +7,14 @@
 define float @v_fma(float %a, float %b, float %c)  {
 ; GFX10-LABEL: v_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %c)
   ret float %fma
@@ -23,14 +23,14 @@ define float @v_fma(float %a, float %b, float %c)  {
 define float @v_fma_imm(float %a, float %c)  {
 ; GFX10-LABEL: v_fma_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, 0x41200000, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_imm:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, 0x41200000, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.amdgcn.fma.legacy(float %a, float 10.0, float %c)
   ret float %fma
@@ -39,14 +39,14 @@ define float @v_fma_imm(float %a, float %c)  {
 define float @v_fabs_fma(float %a, float %b, float %c)  {
 ; GFX10-LABEL: v_fabs_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, |v0|, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fabs_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, |v0|, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.a = call float @llvm.fabs.f32(float %a)
   %fma = call float @llvm.amdgcn.fma.legacy(float %fabs.a, float %b, float %c)
@@ -56,14 +56,14 @@ define float @v_fabs_fma(float %a, float %b, float %c)  {
 define float @v_fneg_fabs_fma(float %a, float %b, float %c)  {
 ; GFX10-LABEL: v_fneg_fabs_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, v0, -|v1|, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fabs_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, v0, -|v1|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fabs.b = call float @llvm.fabs.f32(float %b)
   %neg.fabs.b = fneg float %fabs.b
@@ -74,14 +74,14 @@ define float @v_fneg_fabs_fma(float %a, float %b, float %c)  {
 define float @v_fneg_fma(float %a, float %b, float %c)  {
 ; GFX10-LABEL: v_fneg_fma:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, v0, v1, -v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fneg_fma:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, v0, v1, -v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.c = fneg float %c
   %fma = call float @llvm.amdgcn.fma.legacy(float %a, float %b, float %neg.c)
@@ -91,14 +91,14 @@ define float @v_fneg_fma(float %a, float %b, float %c)  {
 define float @v_fma_const_const(float %a)  {
 ; GFX10-LABEL: v_fma_const_const:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_legacy_f32 v0, v0, 2.0, -1.0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fma_const_const:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_dx9_zero_f32 v0, v0, 2.0, -1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.amdgcn.fma.legacy(float %a, float 2.0, float -1.0)
   ret float %fma
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 61818dafd2b84c7..c19d28538facaed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -7,11 +7,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 define void @function_lds_id(ptr addrspace(1) %out) {
 ; GCN-LABEL: function_lds_id:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_add_i32 s4, s15, s12
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.amdgcn.lds.kernel.id()
   %help = call i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
index ebdda381a0762c8..7d940b05e912c15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.ll
@@ -5,8 +5,8 @@
 define float @v_log_f32(float %src)  {
 ; GCN-LABEL: v_log_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %log = call float @llvm.amdgcn.log.f32(float %src)
   ret float %log
@@ -15,8 +15,8 @@ define float @v_log_f32(float %src)  {
 define float @v_fabs_log_f32(float %src)  {
 ; GCN-LABEL: v_fabs_log_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f32_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %log = call float @llvm.amdgcn.log.f32(float %fabs.src)
@@ -26,8 +26,8 @@ define float @v_fabs_log_f32(float %src)  {
 define float @v_fneg_fabs_log_f32(float %src)  {
 ; GCN-LABEL: v_fneg_fabs_log_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %neg.fabs.src = fneg float %fabs.src
@@ -38,8 +38,8 @@ define float @v_fneg_fabs_log_f32(float %src)  {
 define half @v_log_f16(half %src)  {
 ; GCN-LABEL: v_log_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %log = call half @llvm.amdgcn.log.f16(half %src)
   ret half %log
@@ -48,8 +48,8 @@ define half @v_log_f16(half %src)  {
 define half @v_fabs_log_f16(half %src)  {
 ; GCN-LABEL: v_fabs_log_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f16_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %log = call half @llvm.amdgcn.log.f16(half %fabs.src)
@@ -59,8 +59,8 @@ define half @v_fabs_log_f16(half %src)  {
 define half @v_fneg_fabs_log_f16(half %src)  {
 ; GCN-LABEL: v_fneg_fabs_log_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_log_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %neg.fabs.src = fneg half %fabs.src
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
index dc9698d19d76505..9867ff020bf2c1a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.i24.ll
@@ -5,8 +5,8 @@
 define i32 @basic(i32 %a, i32 %b) {
 ; CHECK-LABEL: basic:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mul_hi_i32_i24_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %mul = call i32 @llvm.amdgcn.mulhi.i24(i32 %a, i32 %b)
   ret i32 %mul
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
index 92120b61dca1b41..0d4a03e774956f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mulhi.u24.ll
@@ -5,8 +5,8 @@
 define i32 @basic(i32 %a, i32 %b) {
 ; CHECK-LABEL: basic:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mul_hi_u32_u24_e32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %mul = call i32 @llvm.amdgcn.mulhi.u24(i32 %a, i32 %b)
   ret i32 %mul
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index d7a84743e78c5b1..e6f52675222340b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -8,8 +8,8 @@ declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
 define i32 @test_quadmask_constant_zero_i32() {
 ; GFX11-LABEL: test_quadmask_constant_zero_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 0)
@@ -19,8 +19,8 @@ entry:
 define i32 @test_quadmask_constant_neg_one_i32() {
 ; GFX11-LABEL: test_quadmask_constant_neg_one_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 -1)
@@ -30,9 +30,9 @@ entry:
 define i32 @test_quadmask_constant_undef_i32() {
 ; GFX11-LABEL: test_quadmask_constant_undef_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_quadmask_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 undef)
@@ -42,9 +42,9 @@ entry:
 define i32 @test_quadmask_constant_poison_i32() {
 ; GFX11-LABEL: test_quadmask_constant_poison_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_quadmask_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 poison)
@@ -54,8 +54,8 @@ entry:
 define i32 @test_quadmask_constant_i32() {
 ; GFX11-LABEL: test_quadmask_constant_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xcb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 u0x85003092)
@@ -81,10 +81,10 @@ entry:
 define i32 @test_quadmask_vgpr_i32(i32 %mask) {
 ; GFX11-LABEL: test_quadmask_vgpr_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_quadmask_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %mask)
@@ -94,8 +94,8 @@ entry:
 define i64 @test_quadmask_constant_i64() {
 ; GFX11-LABEL: test_quadmask_constant_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0xe3e6 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 u0x67D000FC85F00A90)
@@ -105,8 +105,8 @@ entry:
 define i64 @test_quadmask_constant_zero_i64() {
 ; GFX11-LABEL: test_quadmask_constant_zero_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 0)
@@ -116,8 +116,8 @@ entry:
 define i64 @test_quadmask_constant_neg_one_i64() {
 ; GFX11-LABEL: test_quadmask_constant_neg_one_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0xffff :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 -1)
@@ -127,9 +127,9 @@ entry:
 define i64 @test_quadmask_constant_undef_i64() {
 ; GFX11-LABEL: test_quadmask_constant_undef_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 undef)
@@ -139,9 +139,9 @@ entry:
 define i64 @test_quadmask_constant_poison_i64() {
 ; GFX11-LABEL: test_quadmask_constant_poison_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 poison)
@@ -166,11 +166,11 @@ entry:
 define i64 @test_quadmask_vgpr_i64(i64 %mask) {
 ; GFX11-LABEL: test_quadmask_vgpr_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    s_quadmask_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
index 8dd414f78340bb0..24038a90f9f5c61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll
@@ -9,7 +9,6 @@ declare void @llvm.amdgcn.s.setprio(i16) #0
 define void @test_llvm_amdgcn_s_setprio() #0 {
 ; GFX9-LABEL: test_llvm_amdgcn_s_setprio:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX9-NEXT:    s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
 ; GFX9-NEXT:    s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
 ; GFX9-NEXT:    s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
@@ -19,11 +18,11 @@ define void @test_llvm_amdgcn_s_setprio() #0 {
 ; GFX9-NEXT:    s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
 ; GFX9-NEXT:    s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
 ; GFX9-NEXT:    s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
 ;
 ; SI-LABEL: test_llvm_amdgcn_s_setprio:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; SI-NEXT:    s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
 ; SI-NEXT:    s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
 ; SI-NEXT:    s_setprio 2 ; encoding: [0x02,0x00,0x8f,0xbf]
@@ -33,6 +32,7 @@ define void @test_llvm_amdgcn_s_setprio() #0 {
 ; SI-NEXT:    s_setprio 0 ; encoding: [0x00,0x00,0x8f,0xbf]
 ; SI-NEXT:    s_setprio 1 ; encoding: [0x01,0x00,0x8f,0xbf]
 ; SI-NEXT:    s_setprio -1 ; encoding: [0xff,0xff,0x8f,0xbf]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
 ; SI-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
   call void @llvm.amdgcn.s.setprio(i16 0)
   call void @llvm.amdgcn.s.setprio(i16 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 6fd6d6e2e31a1c2..a021745d27ca1cf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -11,20 +11,18 @@
 define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %inactive, i32 %active) {
 ; GFX11-LABEL: set_inactive_chain_arg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v11
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v11
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
@@ -33,20 +31,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ;
 ; GFX11_W64-LABEL: set_inactive_chain_arg:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v11
+; GFX11_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v11
+; GFX10_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
@@ -60,23 +56,21 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i64 %inactive, i64 %active) {
 ; GFX11-LABEL: set_inactive_chain_arg_64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v12
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v13
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v12
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v13
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v11
@@ -86,23 +80,21 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ;
 ; GFX11_W64-LABEL: set_inactive_chain_arg_64:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GFX11_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_64:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v13
+; GFX10_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX10_W64-NEXT:    s_not_b64 exec, exec
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, v11
@@ -117,7 +109,7 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i32 %inactive, i32 %active) {
 ; GFX11-LABEL: set_inactive_chain_arg_dpp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
@@ -133,13 +125,11 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_dpp:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s0
@@ -157,7 +147,7 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ;
 ; GFX11_W64-LABEL: set_inactive_chain_arg_dpp:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -174,13 +164,11 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_dpp:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX10_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -207,9 +195,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out, i32 %inactive, i32 %active) {
 ; GISEL11-LABEL: set_inactive_chain_arg_call:
 ; GISEL11:       ; %bb.0:
-; GISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL11-NEXT:    s_mov_b32 s32, 0
 ; GISEL11-NEXT:    v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v42, v9
+; GISEL11-NEXT:    s_waitcnt expcnt(0)
 ; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GISEL11-NEXT:    v_mov_b32_e32 v40, v10
 ; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
@@ -233,15 +221,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_call:
 ; DAGISEL11:       ; %bb.0:
-; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11-NEXT:    s_mov_b32 s32, 0
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v43, v11
+; DAGISEL11-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v40, v10
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
@@ -265,16 +251,14 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_call:
 ; GISEL10:       ; %bb.0:
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL10-NEXT:    s_mov_b32 s32, 0
 ; GISEL10-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL10-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL10-NEXT:    s_waitcnt expcnt(0)
 ; GISEL10-NEXT:    s_or_saveexec_b32 s0, -1
 ; GISEL10-NEXT:    v_mov_b32_e32 v40, v10
 ; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
@@ -309,9 +293,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ;
 ; DAGISEL10-LABEL: set_inactive_chain_arg_call:
 ; DAGISEL10:       ; %bb.0:
-; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10-NEXT:    s_mov_b32 s32, 0
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v43, v11
+; DAGISEL10-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v40, v10
 ; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
@@ -347,10 +331,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ;
 ; GISEL11_W64-LABEL: set_inactive_chain_arg_call:
 ; GISEL11_W64:       ; %bb.0:
-; GISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL11_W64-NEXT:    s_mov_b32 s32, 0
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL11_W64-NEXT:    s_waitcnt expcnt(0)
 ; GISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v40, v10
 ; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -380,15 +364,13 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call:
 ; DAGISEL11_W64:       ; %bb.0:
-; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11_W64-NEXT:    s_mov_b32 s32, 0
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v43, v11
+; DAGISEL11_W64-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v40, v10
 ; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -419,16 +401,14 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_call:
 ; GISEL10_W64:       ; %bb.0:
-; GISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL10_W64-NEXT:    s_mov_b32 s32, 0
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v42, v9
+; GISEL10_W64-NEXT:    s_waitcnt expcnt(0)
 ; GISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v40, v10
 ; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -463,9 +443,9 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ;
 ; DAGISEL10_W64-LABEL: set_inactive_chain_arg_call:
 ; DAGISEL10_W64:       ; %bb.0:
-; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10_W64-NEXT:    s_mov_b32 s32, 0
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v43, v11
+; DAGISEL10_W64-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v40, v10
 ; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -512,10 +492,10 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %out, i32 %active, i32 %inactive) {
 ; GISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
 ; GISEL11:       ; %bb.0:
-; GISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL11-NEXT:    s_mov_b32 s32, 0
 ; GISEL11-NEXT:    v_dual_mov_b32 v41, v8 :: v_dual_mov_b32 v42, v9
 ; GISEL11-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL11-NEXT:    s_waitcnt expcnt(0)
 ; GISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GISEL11-NEXT:    v_mov_b32_e32 v40, v11
 ; GISEL11-NEXT:    s_mov_b32 exec_lo, s0
@@ -538,14 +518,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
 ; DAGISEL11:       ; %bb.0:
-; DAGISEL11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11-NEXT:    s_mov_b32 s32, 0
+; DAGISEL11-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL11-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v40, v11
 ; DAGISEL11-NEXT:    s_mov_b32 exec_lo, s0
@@ -570,17 +548,15 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
 ; GISEL10:       ; %bb.0:
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL10-NEXT:    s_mov_b32 s32, 0
 ; GISEL10-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL10-NEXT:    v_mov_b32_e32 v42, v9
 ; GISEL10-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL10-NEXT:    s_waitcnt expcnt(0)
 ; GISEL10-NEXT:    s_or_saveexec_b32 s0, -1
 ; GISEL10-NEXT:    v_mov_b32_e32 v40, v11
 ; GISEL10-NEXT:    s_mov_b32 exec_lo, s0
@@ -614,8 +590,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ;
 ; DAGISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
 ; DAGISEL10:       ; %bb.0:
-; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10-NEXT:    s_mov_b32 s32, 0
+; DAGISEL10-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL10-NEXT:    s_or_saveexec_b32 s0, -1
 ; DAGISEL10-NEXT:    v_mov_b32_e32 v40, v11
 ; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s0
@@ -652,11 +628,11 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ;
 ; GISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
 ; GISEL11_W64:       ; %bb.0:
-; GISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL11_W64-NEXT:    s_mov_b32 s32, 0
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v42, v9
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL11_W64-NEXT:    s_waitcnt expcnt(0)
 ; GISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v40, v11
 ; GISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -685,14 +661,12 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
 ; DAGISEL11_W64:       ; %bb.0:
-; DAGISEL11_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL11_W64-NEXT:    s_mov_b32 s32, 0
+; DAGISEL11_W64-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL11_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v40, v11
 ; DAGISEL11_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -724,17 +698,15 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
 ; GISEL10_W64:       ; %bb.0:
-; GISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL10_W64-NEXT:    s_mov_b32 s32, 0
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v41, v8
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v42, v9
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v43, v10
+; GISEL10_W64-NEXT:    s_waitcnt expcnt(0)
 ; GISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; GISEL10_W64-NEXT:    v_mov_b32_e32 v40, v11
 ; GISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
@@ -768,8 +740,8 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ;
 ; DAGISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
 ; DAGISEL10_W64:       ; %bb.0:
-; DAGISEL10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL10_W64-NEXT:    s_mov_b32 s32, 0
+; DAGISEL10_W64-NEXT:    s_waitcnt expcnt(0)
 ; DAGISEL10_W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
 ; DAGISEL10_W64-NEXT:    v_mov_b32_e32 v40, v11
 ; DAGISEL10_W64-NEXT:    s_mov_b64 exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
index bbfb88a4b22a361..f3c39b0d58ccbcf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll
@@ -5,8 +5,8 @@
 define half @v_sqrt_f16(half %src)  {
 ; GCN-LABEL: v_sqrt_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.amdgcn.sqrt.f16(half %src)
   ret half %sqrt
@@ -15,8 +15,8 @@ define half @v_sqrt_f16(half %src)  {
 define half @v_fabs_sqrt_f16(half %src)  {
 ; GCN-LABEL: v_fabs_sqrt_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f16_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %sqrt = call half @llvm.amdgcn.sqrt.f16(half %fabs.src)
@@ -26,8 +26,8 @@ define half @v_fabs_sqrt_f16(half %src)  {
 define half @v_fneg_fabs_sqrt_f16(half %src)  {
 ; GCN-LABEL: v_fneg_fabs_sqrt_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f16_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call half @llvm.fabs.f16(half %src)
   %neg.fabs.src = fneg half %fabs.src
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.ll
index 0257a3d111421dc..3512f33b934e9ac 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.ll
@@ -7,8 +7,8 @@
 define float @v_sqrt_f32(float %src)  {
 ; GCN-LABEL: v_sqrt_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.amdgcn.sqrt.f32(float %src)
   ret float %sqrt
@@ -17,8 +17,8 @@ define float @v_sqrt_f32(float %src)  {
 define float @v_fabs_sqrt_f32(float %src)  {
 ; GCN-LABEL: v_fabs_sqrt_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %sqrt = call float @llvm.amdgcn.sqrt.f32(float %fabs.src)
@@ -28,8 +28,8 @@ define float @v_fabs_sqrt_f32(float %src)  {
 define float @v_fneg_fabs_sqrt_f32(float %src)  {
 ; GCN-LABEL: v_fneg_fabs_sqrt_f32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %neg.fabs.src = fneg float %fabs.src
@@ -40,8 +40,8 @@ define float @v_fneg_fabs_sqrt_f32(float %src)  {
 define double @v_sqrt_f64(double %src)  {
 ; GCN-LABEL: v_sqrt_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
   ret double %sqrt
@@ -50,8 +50,8 @@ define double @v_sqrt_f64(double %src)  {
 define double @v_fabs_sqrt_f64(double %src)  {
 ; GCN-LABEL: v_fabs_sqrt_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], |v[0:1]|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %sqrt = call double @llvm.amdgcn.sqrt.f64(double %fabs.src)
@@ -61,8 +61,8 @@ define double @v_fabs_sqrt_f64(double %src)  {
 define double @v_fneg_fabs_sqrt_f64(double %src)  {
 ; GCN-LABEL: v_fneg_fabs_sqrt_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sqrt_f64_e64 v[0:1], -|v[0:1]|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %neg.fabs.src = fneg double %fabs.src
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
index 038f05e4faad7f0..5051a989d9a1069 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll
@@ -6,8 +6,8 @@ declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1
 define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -17,8 +17,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_us:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -28,8 +28,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_su:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -39,8 +39,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -52,8 +52,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -63,8 +63,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
@@ -74,8 +74,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -85,8 +85,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
index 27a227837d88f59..b085227edb378ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll
@@ -6,8 +6,8 @@ declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1
 define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -17,8 +17,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_us:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -28,8 +28,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_su:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0)
@@ -39,8 +39,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0)
@@ -52,8 +52,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -63,8 +63,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
@@ -74,8 +74,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1)
@@ -85,8 +85,8 @@ entry:
 define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
 entry:
   %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
index 34faa347c2f91a5..07fce6a1b3728c3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.ll
@@ -8,8 +8,8 @@ declare i64 @llvm.amdgcn.s.wqm.i64(i64)
 define i32 @test_s_wqm_constant_i32() {
 ; GFX11-LABEL: test_s_wqm_constant_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xff00ff0f
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 u0x85003A02)
   ret i32 %br
@@ -18,8 +18,8 @@ define i32 @test_s_wqm_constant_i32() {
 define i32 @test_s_wqm_constant_zero_i32() {
 ; GFX11-LABEL: test_s_wqm_constant_zero_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 0)
   ret i32 %br
@@ -28,8 +28,8 @@ define i32 @test_s_wqm_constant_zero_i32() {
 define i32 @test_s_wqm_constant_neg_one_i32() {
 ; GFX11-LABEL: test_s_wqm_constant_neg_one_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 -1)
   ret i32 %br
@@ -38,9 +38,9 @@ define i32 @test_s_wqm_constant_neg_one_i32() {
 define i32 @test_s_wqm_constant_undef_i32() {
 ; GFX11-LABEL: test_s_wqm_constant_undef_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_wqm_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 undef)
   ret i32 %br
@@ -49,9 +49,9 @@ define i32 @test_s_wqm_constant_undef_i32() {
 define i32 @test_s_wqm_constant_poison_i32() {
 ; GFX11-LABEL: test_s_wqm_constant_poison_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_wqm_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 poison)
   ret i32 %br
@@ -76,10 +76,10 @@ entry:
 define i32 @test_s_wqm_vgpr_i32(i32 %mask) {
 ; GFX11-LABEL: test_s_wqm_vgpr_i32:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_wqm_b32 s0, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i32 @llvm.amdgcn.s.wqm.i32(i32 %mask)
@@ -89,9 +89,9 @@ entry:
 define i64 @test_s_wqm_constant_i64() {
 ; GFX11-LABEL: test_s_wqm_constant_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0xff00ffff
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0xffff0fff
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 u0x12480FDBAC00753E)
   ret i64 %br
@@ -100,8 +100,8 @@ define i64 @test_s_wqm_constant_i64() {
 define i64 @test_s_wqm_constant_zero_i64() {
 ; GFX11-LABEL: test_s_wqm_constant_zero_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 0)
   ret i64 %br
@@ -110,8 +110,8 @@ define i64 @test_s_wqm_constant_zero_i64() {
 define i64 @test_s_wqm_constant_neg_one_i64() {
 ; GFX11-LABEL: test_s_wqm_constant_neg_one_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, -1 :: v_dual_mov_b32 v1, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 -1)
   ret i64 %br
@@ -120,9 +120,9 @@ define i64 @test_s_wqm_constant_neg_one_i64() {
 define i64 @test_s_wqm_constant_undef_i64() {
 ; GFX11-LABEL: test_s_wqm_constant_undef_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 undef)
   ret i64 %br
@@ -131,9 +131,9 @@ define i64 @test_s_wqm_constant_undef_i64() {
 define i64 @test_s_wqm_constant_poison_i64() {
 ; GFX11-LABEL: test_s_wqm_constant_poison_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 poison)
   ret i64 %br
@@ -157,11 +157,11 @@ entry:
 define i64 @test_s_wqm_vgpr_i64(i64 %mask) {
 ; GFX11-LABEL: test_s_wqm_vgpr_i64:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX11-NEXT:    s_wqm_b64 s[0:1], s[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %br = call i64 @llvm.amdgcn.s.wqm.i64(i64 %mask)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index c8570d6f279a6b2..a28a4377888b4f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -2537,7 +2537,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 define float @v_exp_f32(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -2559,11 +2558,11 @@ define float @v_exp_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -2585,11 +2584,11 @@ define float @v_exp_f32(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2608,11 +2607,11 @@ define float @v_exp_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -2631,11 +2630,11 @@ define float @v_exp_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2654,11 +2653,11 @@ define float @v_exp_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -2677,6 +2676,7 @@ define float @v_exp_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32:
@@ -2695,7 +2695,6 @@ define float @v_exp_f32(float %in) {
 define float @v_exp_fabs_f32(float %in) {
 ; VI-SDAG-LABEL: v_exp_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-SDAG-NEXT:    v_sub_f32_e64 v4, |v0|, v1
@@ -2718,11 +2717,11 @@ define float @v_exp_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e64 v2, |v0|, v1
@@ -2745,11 +2744,11 @@ define float @v_exp_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2768,11 +2767,11 @@ define float @v_exp_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e64 vcc, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, |v0|, v1, -v2
@@ -2791,11 +2790,11 @@ define float @v_exp_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2814,11 +2813,11 @@ define float @v_exp_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e64 vcc, |v0|, s4
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v2, |v0|, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v1, |v0|, v1, -v2
@@ -2837,6 +2836,7 @@ define float @v_exp_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fabs_f32:
@@ -2856,7 +2856,6 @@ define float @v_exp_fabs_f32(float %in) {
 define float @v_exp_fneg_fabs_f32(float %in) {
 ; VI-SDAG-LABEL: v_exp_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, 0x80000000, v0
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-SDAG-NEXT:    v_sub_f32_e64 v4, -|v0|, v1
@@ -2879,11 +2878,11 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fneg_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, 0x80000000, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e64 v2, -|v0|, v1
@@ -2906,11 +2905,11 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -|v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbfb8aa3b
 ; GFX900-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2929,11 +2928,11 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e64 vcc, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v1, -v2
@@ -2952,11 +2951,11 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -|v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbfb8aa3b
 ; SI-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s4
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -2975,11 +2974,11 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e64 vcc, |v0|, s4
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -|v0|, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v1, -v2
@@ -2998,6 +2997,7 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -|v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_fabs_f32:
@@ -3018,7 +3018,6 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 define float @v_exp_fneg_f32(float %in) {
 ; VI-SDAG-LABEL: v_exp_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-SDAG-NEXT:    v_sub_f32_e64 v4, -v0, v1
@@ -3041,11 +3040,11 @@ define float @v_exp_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fneg_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e64 v2, -v0, v1
@@ -3068,11 +3067,11 @@ define float @v_exp_fneg_f32(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0xbfb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0xbfb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3091,11 +3090,11 @@ define float @v_exp_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, -v0, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, -v0, v1, -v2
@@ -3114,11 +3113,11 @@ define float @v_exp_fneg_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0xbfb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xbfb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3137,11 +3136,11 @@ define float @v_exp_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -v0, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v1, -v0, v1, -v2
@@ -3160,6 +3159,7 @@ define float @v_exp_fneg_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, -v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_f32:
@@ -3179,7 +3179,6 @@ define float @v_exp_fneg_f32(float %in) {
 define float @v_exp_f32_fast(float %in) {
 ; GCN-SDAG-LABEL: v_exp_f32_fast:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3188,11 +3187,11 @@ define float @v_exp_f32_fast(float %in) {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_f32_fast:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3201,11 +3200,11 @@ define float @v_exp_f32_fast(float %in) {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3214,11 +3213,11 @@ define float @v_exp_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3227,6 +3226,7 @@ define float @v_exp_f32_fast(float %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_fast:
@@ -3245,7 +3245,6 @@ define float @v_exp_f32_fast(float %in) {
 define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-SDAG-LABEL: v_exp_f32_unsafe_math_attr:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3254,11 +3253,11 @@ define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_f32_unsafe_math_attr:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3267,11 +3266,11 @@ define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_unsafe_math_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3280,11 +3279,11 @@ define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_unsafe_math_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3293,6 +3292,7 @@ define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_unsafe_math_attr:
@@ -3311,7 +3311,6 @@ define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GCN-SDAG-LABEL: v_exp_f32_approx_fn_attr:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3320,11 +3319,11 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_f32_approx_fn_attr:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3333,11 +3332,11 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_approx_fn_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3346,11 +3345,11 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_approx_fn_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3359,6 +3358,7 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_approx_fn_attr:
@@ -3377,7 +3377,6 @@ define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 define float @v_exp_f32_ninf(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -3395,11 +3394,11 @@ define float @v_exp_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -3417,11 +3416,11 @@ define float @v_exp_f32_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3436,11 +3435,11 @@ define float @v_exp_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -3455,11 +3454,11 @@ define float @v_exp_f32_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3474,11 +3473,11 @@ define float @v_exp_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -3493,6 +3492,7 @@ define float @v_exp_f32_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_ninf:
@@ -3511,7 +3511,6 @@ define float @v_exp_f32_ninf(float %in) {
 define float @v_exp_f32_afn(float %in) {
 ; GCN-SDAG-LABEL: v_exp_f32_afn:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3520,11 +3519,11 @@ define float @v_exp_f32_afn(float %in) {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_f32_afn:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3533,11 +3532,11 @@ define float @v_exp_f32_afn(float %in) {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3546,11 +3545,11 @@ define float @v_exp_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3559,6 +3558,7 @@ define float @v_exp_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_afn:
@@ -3577,16 +3577,16 @@ define float @v_exp_f32_afn(float %in) {
 define float @v_exp_f32_afn_daz(float %in) #0 {
 ; GCN-LABEL: v_exp_f32_afn_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: v_exp_f32_afn_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-NEXT:    v_exp_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_afn_daz:
@@ -3605,7 +3605,6 @@ define float @v_exp_f32_afn_daz(float %in) #0 {
 define float @v_exp_f32_afn_dynamic(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp_f32_afn_dynamic:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3614,11 +3613,11 @@ define float @v_exp_f32_afn_dynamic(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_f32_afn_dynamic:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3627,11 +3626,11 @@ define float @v_exp_f32_afn_dynamic(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_afn_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, 0x42800000, v0
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3640,11 +3639,11 @@ define float @v_exp_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_afn_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3653,6 +3652,7 @@ define float @v_exp_f32_afn_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_afn_dynamic:
@@ -3671,7 +3671,6 @@ define float @v_exp_f32_afn_dynamic(float %in) #1 {
 define float @v_fabs_exp_f32_afn(float %in) {
 ; GCN-SDAG-LABEL: v_fabs_exp_f32_afn:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; GCN-SDAG-NEXT:    s_mov_b32 s5, 0x42800000
 ; GCN-SDAG-NEXT:    v_add_f32_e64 v1, |v0|, s5
@@ -3681,11 +3680,11 @@ define float @v_fabs_exp_f32_afn(float %in) {
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_fabs_exp_f32_afn:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_add_f32_e64 v2, |v0|, v2
@@ -3695,11 +3694,11 @@ define float @v_fabs_exp_f32_afn(float %in) {
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_fabs_exp_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x42800000
 ; SI-SDAG-NEXT:    v_add_f32_e64 v1, |v0|, s5
@@ -3709,11 +3708,11 @@ define float @v_fabs_exp_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_fabs_exp_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2aeac50
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; SI-GISEL-NEXT:    v_add_f32_e64 v2, |v0|, v2
@@ -3723,6 +3722,7 @@ define float @v_fabs_exp_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fabs_exp_f32_afn:
@@ -3742,7 +3742,6 @@ define float @v_fabs_exp_f32_afn(float %in) {
 define float @v_exp_f32_daz(float %in) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -3764,11 +3763,11 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -3790,11 +3789,11 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3813,11 +3812,11 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -3836,11 +3835,11 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3859,11 +3858,11 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -3882,6 +3881,7 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_daz:
@@ -3900,7 +3900,6 @@ define float @v_exp_f32_daz(float %in) #0 {
 define float @v_exp_f32_nnan(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -3922,11 +3921,11 @@ define float @v_exp_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -3948,11 +3947,11 @@ define float @v_exp_f32_nnan(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -3971,11 +3970,11 @@ define float @v_exp_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -3994,11 +3993,11 @@ define float @v_exp_f32_nnan(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4017,11 +4016,11 @@ define float @v_exp_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4040,6 +4039,7 @@ define float @v_exp_f32_nnan(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan:
@@ -4058,7 +4058,6 @@ define float @v_exp_f32_nnan(float %in) {
 define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_nnan_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4080,11 +4079,11 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4106,11 +4105,11 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4129,11 +4128,11 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4152,11 +4151,11 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4175,11 +4174,11 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4198,6 +4197,7 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan_daz:
@@ -4216,7 +4216,6 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-LABEL: v_exp_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4238,11 +4237,11 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4264,11 +4263,11 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4287,11 +4286,11 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4310,11 +4309,11 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4333,11 +4332,11 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4356,6 +4355,7 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan_dynamic:
@@ -4374,7 +4374,6 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_ninf_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4392,11 +4391,11 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_ninf_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4414,11 +4413,11 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4433,11 +4432,11 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4452,11 +4451,11 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4471,11 +4470,11 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4490,6 +4489,7 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_ninf_daz:
@@ -4508,7 +4508,6 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-LABEL: v_exp_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4526,11 +4525,11 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4548,11 +4547,11 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4567,11 +4566,11 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4586,11 +4585,11 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4605,11 +4604,11 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4624,6 +4623,7 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_ninf_dynamic:
@@ -4642,7 +4642,6 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 define float @v_exp_f32_nnan_ninf(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4660,11 +4659,11 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4682,11 +4681,11 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4701,11 +4700,11 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4720,11 +4719,11 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4739,11 +4738,11 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4758,6 +4757,7 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan_ninf:
@@ -4776,7 +4776,6 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_nnan_ninf_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4794,11 +4793,11 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan_ninf_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4816,11 +4815,11 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4835,11 +4834,11 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4854,11 +4853,11 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4873,11 +4872,11 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4892,6 +4891,7 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan_ninf_daz:
@@ -4910,7 +4910,6 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -4928,11 +4927,11 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -4950,11 +4949,11 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -4969,11 +4968,11 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -4988,11 +4987,11 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -5007,11 +5006,11 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -5026,6 +5025,7 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_nnan_ninf_dynamic:
@@ -5044,16 +5044,16 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 define float @v_exp_f32_fast_daz(float %in) #0 {
 ; GCN-LABEL: v_exp_f32_fast_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: v_exp_f32_fast_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-NEXT:    v_exp_f32_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_fast_daz:
@@ -5072,7 +5072,6 @@ define float @v_exp_f32_fast_daz(float %in) #0 {
 define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-LABEL: v_exp_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -5094,11 +5093,11 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_dynamic_mode:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -5120,11 +5119,11 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -5143,11 +5142,11 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_dynamic_mode:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -5166,11 +5165,11 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_dynamic_mode:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -5189,11 +5188,11 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_dynamic_mode:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -5212,6 +5211,7 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_dynamic_mode:
@@ -5230,18 +5230,17 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 define float @v_exp_f32_undef() {
 ; VI-SDAG-LABEL: v_exp_f32_undef:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_rndne_f32_e32 v0, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7fc00000
 ; VI-SDAG-NEXT:    v_add_f32_e64 v1, -v0, s4
 ; VI-SDAG-NEXT:    v_exp_f32_e32 v1, v1
 ; VI-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_undef:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_sub_f32_e64 v0, s4, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8a000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x39a3b295
@@ -5264,11 +5263,11 @@ define float @v_exp_f32_undef() {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_undef:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0xffc00000
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, s4, v0, v1
@@ -5280,11 +5279,11 @@ define float @v_exp_f32_undef() {
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_undef:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, s4, v0, -v1
@@ -5303,11 +5302,11 @@ define float @v_exp_f32_undef() {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_undef:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0xffc00000
 ; SI-SDAG-NEXT:    v_fma_f32 v0, s4, v0, v1
@@ -5319,11 +5318,11 @@ define float @v_exp_f32_undef() {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_undef:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v0, s4, v0, -v1
@@ -5342,6 +5341,7 @@ define float @v_exp_f32_undef() {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_undef:
@@ -5360,14 +5360,14 @@ define float @v_exp_f32_undef() {
 define float @v_exp_f32_0() {
 ; GCN-LABEL: v_exp_f32_0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: v_exp_f32_0:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_0:
@@ -5386,7 +5386,6 @@ define float @v_exp_f32_0() {
 define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; VI-SDAG-LABEL: v_exp_f32_from_fpext_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5409,11 +5408,11 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_from_fpext_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -5436,11 +5435,11 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_from_fpext_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x32a5705f
@@ -5460,11 +5459,11 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_from_fpext_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
@@ -5484,11 +5483,11 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_from_fpext_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x32a5705f
@@ -5508,11 +5507,11 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_from_fpext_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
@@ -5532,6 +5531,7 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_from_fpext_f16:
@@ -5552,7 +5552,6 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
@@ -5576,11 +5575,11 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5604,11 +5603,11 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_from_fpext_math_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
@@ -5629,11 +5628,11 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_from_fpext_math_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
@@ -5654,11 +5653,11 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
@@ -5680,11 +5679,11 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
@@ -5708,6 +5707,7 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_from_fpext_math_f16:
@@ -5730,7 +5730,6 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; VI-LABEL: v_exp_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -5752,11 +5751,11 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-NEXT:    v_rndne_f32_e32 v2, v1
@@ -5775,11 +5774,11 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: v_exp_f32_from_fpext_bf16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-NEXT:    v_rndne_f32_e32 v2, v1
@@ -5798,6 +5797,7 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 ; SI-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_from_fpext_bf16:
@@ -5817,16 +5817,15 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
 define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) {
 ; GCN-LABEL: v_exp_f32_from_fpext_math_f16_fast:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2aeac50
@@ -5838,11 +5837,11 @@ define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x114b4ea4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
@@ -5850,6 +5849,7 @@ define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_from_fpext_math_f16_fast:
@@ -5872,7 +5872,6 @@ define float @v_exp_f32_from_fpext_math_f16_fast(i16 %src0.i, i16 %src1.i) {
 define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
@@ -5896,11 +5895,11 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5924,11 +5923,11 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
@@ -5949,11 +5948,11 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
@@ -5974,11 +5973,11 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
@@ -6000,11 +5999,11 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_from_fpext_math_f16_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
@@ -6028,6 +6027,7 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_from_fpext_math_f16_daz:
@@ -6051,31 +6051,31 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 define half @v_exp_f16(half %in) {
 ; GCN-LABEL: v_exp_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f16:
@@ -6094,31 +6094,31 @@ define half @v_exp_f16(half %in) {
 define half @v_exp_fabs_f16(half %in) {
 ; GCN-LABEL: v_exp_fabs_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fabs_f16:
@@ -6138,40 +6138,40 @@ define half @v_exp_fabs_f16(half %in) {
 define half @v_exp_fneg_fabs_f16(half %in) {
 ; GCN-SDAG-LABEL: v_exp_fneg_fabs_f16:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0xbfb8aa3b, v0
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_fneg_fabs_f16:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0xbfb8aa3b, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_fabs_f16:
@@ -6192,40 +6192,40 @@ define half @v_exp_fneg_fabs_f16(half %in) {
 define half @v_exp_fneg_f16(half %in) {
 ; GCN-SDAG-LABEL: v_exp_fneg_f16:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, 0xbfb8aa3b, v0
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp_fneg_f16:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GCN-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GCN-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_f16:
@@ -6245,25 +6245,24 @@ define half @v_exp_fneg_f16(half %in) {
 define half @v_exp_f16_fast(half %in) {
 ; GCN-LABEL: v_exp_f16_fast:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
 ; GCN-NEXT:    v_exp_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8a000, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, 0x3dc5
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -6271,6 +6270,7 @@ define half @v_exp_f16_fast(half %in) {
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f16_fast:
@@ -6289,7 +6289,6 @@ define half @v_exp_f16_fast(half %in) {
 define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6299,11 +6298,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6313,11 +6312,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp_v2f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6327,11 +6326,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6344,11 +6343,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
@@ -6357,6 +6356,7 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v2f16:
@@ -6375,7 +6375,6 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6385,11 +6384,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6400,11 +6399,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6414,11 +6413,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6429,11 +6428,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
@@ -6446,11 +6445,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6464,6 +6463,7 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fabs_v2f16:
@@ -6483,7 +6483,6 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_fneg_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6493,11 +6492,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fneg_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6508,11 +6507,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6522,11 +6521,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6537,11 +6536,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6558,11 +6557,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6576,6 +6575,7 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6596,7 +6596,6 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_fneg_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6606,11 +6605,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_fneg_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6621,11 +6620,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
@@ -6635,11 +6634,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6650,11 +6649,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6671,11 +6670,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_fneg_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6689,6 +6688,7 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_fneg_v2f16:
@@ -6708,50 +6708,49 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_v2f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3dc5
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x3dc5, v0
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3dc5
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3dc5
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
 ; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v2f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6764,11 +6763,11 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, 0x3dc5
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -6782,6 +6781,7 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v2f16_fast:
@@ -6800,7 +6800,6 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; VI-LABEL: v_exp_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -6814,11 +6813,11 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp_v3f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -6832,11 +6831,11 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v3f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -6855,11 +6854,11 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_v3f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -6872,6 +6871,7 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v3f16:
@@ -6890,7 +6890,6 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; VI-SDAG-LABEL: v_exp_v3f16_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3dc5
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6899,11 +6898,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_v3f16_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3dc5
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x3dc5, v0
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6912,11 +6911,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
 ; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6925,11 +6924,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3dc5
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x3dc5, v0
 ; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -6938,11 +6937,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v3f16_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -6961,11 +6960,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_v3f16_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v3, 0x3dc5
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -6985,6 +6984,7 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v3f16_afn:
@@ -7003,7 +7003,6 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 define float @v_exp_f32_contract(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32_contract:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -7025,11 +7024,11 @@ define float @v_exp_f32_contract(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_contract:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -7051,11 +7050,11 @@ define float @v_exp_f32_contract(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_contract:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7074,11 +7073,11 @@ define float @v_exp_f32_contract(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_contract:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7097,11 +7096,11 @@ define float @v_exp_f32_contract(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_contract:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7120,11 +7119,11 @@ define float @v_exp_f32_contract(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_contract:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7143,6 +7142,7 @@ define float @v_exp_f32_contract(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_contract:
@@ -7161,7 +7161,6 @@ define float @v_exp_f32_contract(float %in) {
 define float @v_exp_f32_contract_daz(float %in) #0 {
 ; VI-SDAG-LABEL: v_exp_f32_contract_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -7183,11 +7182,11 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_contract_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -7209,11 +7208,11 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_contract_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7232,11 +7231,11 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_contract_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7255,11 +7254,11 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_contract_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7278,11 +7277,11 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-SDAG-NEXT:    v_cmp_nlt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_contract_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7301,6 +7300,7 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_contract_daz:
@@ -7319,7 +7319,6 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; VI-SDAG-LABEL: v_exp_f32_contract_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8a000, v1
@@ -7337,11 +7336,11 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_f32_contract_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x39a3b295, v2
@@ -7359,11 +7358,11 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_f32_contract_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; GFX900-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7378,11 +7377,11 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_ldexp_f32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_f32_contract_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7397,11 +7396,11 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_f32_contract_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
 ; SI-SDAG-NEXT:    v_rndne_f32_e32 v2, v1
@@ -7416,11 +7415,11 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_cmp_ngt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_f32_contract_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
@@ -7435,6 +7434,7 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_f32_contract_nnan_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 587109b9431a1a5..cad798e0fc889b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -974,7 +974,6 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 define float @v_exp2_f32(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -984,11 +983,11 @@ define float @v_exp2_f32(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -998,6 +997,7 @@ define float @v_exp2_f32(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32:
@@ -1016,7 +1016,6 @@ define float @v_exp2_f32(float %in) {
 define float @v_exp2_fabs_f32(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_fabs_f32:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1026,11 +1025,11 @@ define float @v_exp2_fabs_f32(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_fabs_f32:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -1040,6 +1039,7 @@ define float @v_exp2_fabs_f32(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fabs_f32:
@@ -1059,7 +1059,6 @@ define float @v_exp2_fabs_f32(float %in) {
 define float @v_exp2_fneg_fabs_f32(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_fneg_fabs_f32:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0x42fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1069,11 +1068,11 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_fneg_fabs_f32:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -1083,6 +1082,7 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_fabs_f32:
@@ -1103,7 +1103,6 @@ define float @v_exp2_fneg_fabs_f32(float %in) {
 define float @v_exp2_fneg_f32(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_fneg_f32:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0x42fc0000
 ; GCN-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1113,11 +1112,11 @@ define float @v_exp2_fneg_f32(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_fneg_f32:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -1127,6 +1126,7 @@ define float @v_exp2_fneg_f32(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_f32:
@@ -1146,7 +1146,6 @@ define float @v_exp2_fneg_f32(float %in) {
 define float @v_exp2_f32_fast(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_fast:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1156,11 +1155,11 @@ define float @v_exp2_f32_fast(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_fast:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1170,6 +1169,7 @@ define float @v_exp2_f32_fast(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_fast:
@@ -1188,7 +1188,6 @@ define float @v_exp2_f32_fast(float %in) {
 define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-SDAG-LABEL: v_exp2_f32_unsafe_math_attr:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1198,11 +1197,11 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_unsafe_math_attr:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1212,6 +1211,7 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_unsafe_math_attr:
@@ -1230,7 +1230,6 @@ define float @v_exp2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GCN-SDAG-LABEL: v_exp2_f32_approx_fn_attr:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1240,11 +1239,11 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_approx_fn_attr:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1254,6 +1253,7 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_approx_fn_attr:
@@ -1272,7 +1272,6 @@ define float @v_exp2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 define float @v_exp2_f32_ninf(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_ninf:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1282,11 +1281,11 @@ define float @v_exp2_f32_ninf(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_ninf:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1296,6 +1295,7 @@ define float @v_exp2_f32_ninf(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_ninf:
@@ -1314,7 +1314,6 @@ define float @v_exp2_f32_ninf(float %in) {
 define float @v_exp2_f32_afn(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_afn:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1324,11 +1323,11 @@ define float @v_exp2_f32_afn(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_afn:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1338,6 +1337,7 @@ define float @v_exp2_f32_afn(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_afn:
@@ -1356,8 +1356,8 @@ define float @v_exp2_f32_afn(float %in) {
 define float @v_exp2_f32_afn_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_afn_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_afn_daz:
@@ -1376,7 +1376,6 @@ define float @v_exp2_f32_afn_daz(float %in) #0 {
 define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp2_f32_afn_dynamic:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1386,11 +1385,11 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_afn_dynamic:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1400,6 +1399,7 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_afn_dynamic:
@@ -1418,7 +1418,6 @@ define float @v_exp2_f32_afn_dynamic(float %in) #1 {
 define float @v_fabs_exp2_f32_afn(float %in) {
 ; GCN-SDAG-LABEL: v_fabs_exp2_f32_afn:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1428,11 +1427,11 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_fabs_exp2_f32_afn:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -1442,6 +1441,7 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fabs_exp2_f32_afn:
@@ -1461,8 +1461,8 @@ define float @v_fabs_exp2_f32_afn(float %in) {
 define float @v_exp2_f32_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_daz:
@@ -1481,7 +1481,6 @@ define float @v_exp2_f32_daz(float %in) #0 {
 define float @v_exp2_f32_nnan(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_nnan:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1491,11 +1490,11 @@ define float @v_exp2_f32_nnan(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_nnan:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1505,6 +1504,7 @@ define float @v_exp2_f32_nnan(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan:
@@ -1523,8 +1523,8 @@ define float @v_exp2_f32_nnan(float %in) {
 define float @v_exp2_f32_nnan_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_nnan_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan_daz:
@@ -1543,7 +1543,6 @@ define float @v_exp2_f32_nnan_daz(float %in) #0 {
 define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp2_f32_nnan_dynamic:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1553,11 +1552,11 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_nnan_dynamic:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1567,6 +1566,7 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan_dynamic:
@@ -1585,8 +1585,8 @@ define float @v_exp2_f32_nnan_dynamic(float %in) #1 {
 define float @v_exp2_f32_ninf_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_ninf_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_ninf_daz:
@@ -1605,7 +1605,6 @@ define float @v_exp2_f32_ninf_daz(float %in) #0 {
 define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp2_f32_ninf_dynamic:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1615,11 +1614,11 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_ninf_dynamic:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1629,6 +1628,7 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_ninf_dynamic:
@@ -1647,7 +1647,6 @@ define float @v_exp2_f32_ninf_dynamic(float %in) #1 {
 define float @v_exp2_f32_nnan_ninf(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1657,11 +1656,11 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1671,6 +1670,7 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan_ninf:
@@ -1689,8 +1689,8 @@ define float @v_exp2_f32_nnan_ninf(float %in) {
 define float @v_exp2_f32_nnan_ninf_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_nnan_ninf_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan_ninf_daz:
@@ -1709,7 +1709,6 @@ define float @v_exp2_f32_nnan_ninf_daz(float %in) #0 {
 define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1719,11 +1718,11 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_nnan_ninf_dynamic:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1733,6 +1732,7 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_nnan_ninf_dynamic:
@@ -1751,8 +1751,8 @@ define float @v_exp2_f32_nnan_ninf_dynamic(float %in) #1 {
 define float @v_exp2_f32_fast_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_fast_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_fast_daz:
@@ -1771,7 +1771,6 @@ define float @v_exp2_f32_fast_daz(float %in) #0 {
 define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; GCN-SDAG-LABEL: v_exp2_f32_dynamic_mode:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1781,11 +1780,11 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_dynamic_mode:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1795,6 +1794,7 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_dynamic_mode:
@@ -1813,13 +1813,12 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 {
 define float @v_exp2_f32_undef() {
 ; GCN-SDAG-LABEL: v_exp2_f32_undef:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    v_exp_f32_e32 v0, 0x7fc00000
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_undef:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v1, s4, v1
@@ -1830,6 +1829,7 @@ define float @v_exp2_f32_undef() {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_undef:
@@ -1848,8 +1848,8 @@ define float @v_exp2_f32_undef() {
 define float @v_exp2_f32_0() {
 ; GCN-LABEL: v_exp2_f32_0:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_0:
@@ -1878,9 +1878,9 @@ define float @v_exp2_f32_0() {
 define float @v_exp2_f32_from_fpext_f16(i16 %src.i) {
 ; GCN-LABEL: v_exp2_f32_from_fpext_f16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_from_fpext_f16:
@@ -1901,7 +1901,6 @@ define float @v_exp2_f32_from_fpext_f16(i16 %src.i) {
 define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-LABEL: v_exp2_f32_from_fpext_math_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
@@ -1914,33 +1913,34 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_f32_from_fpext_math_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_f32_from_fpext_math_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_exp_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_f32_from_fpext_math_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_exp_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_from_fpext_math_f16:
@@ -1970,7 +1970,6 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
 ; GCN-LABEL: v_exp2_f32_from_fpext_bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -1980,6 +1979,7 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_from_fpext_bf16:
@@ -2011,32 +2011,32 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
 define half @v_exp2_f16(half %in) {
 ; SI-SDAG-LABEL: v_exp2_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_exp_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f16:
@@ -2060,32 +2060,32 @@ define half @v_exp2_f16(half %in) {
 define half @v_exp2_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_exp2_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_exp_f16_e64 v0, |v0|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_exp_f16_e64 v0, |v0|
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fabs_f16:
@@ -2110,32 +2110,32 @@ define half @v_exp2_fabs_f16(half %in) {
 define half @v_exp2_fneg_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_exp2_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_fneg_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_exp_f16_e64 v0, -|v0|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_fneg_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_exp_f16_e64 v0, -|v0|
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_fabs_f16:
@@ -2161,32 +2161,32 @@ define half @v_exp2_fneg_fabs_f16(half %in) {
 define half @v_exp2_fneg_f16(half %in) {
 ; SI-SDAG-LABEL: v_exp2_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fneg_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_exp_f16_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_fneg_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_exp_f16_e64 v0, -v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_f16:
@@ -2211,32 +2211,32 @@ define half @v_exp2_fneg_f16(half %in) {
 define half @v_exp2_f16_fast(half %in) {
 ; SI-SDAG-LABEL: v_exp2_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_exp2_f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_exp_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_exp2_f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f16_fast:
@@ -2260,7 +2260,6 @@ define half @v_exp2_f16_fast(half %in) {
 define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -2271,49 +2270,50 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_v2f16:
@@ -2332,7 +2332,6 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
@@ -2343,11 +2342,11 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2359,40 +2358,41 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e64 v0, |v0|
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, |v0|
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fabs_v2f16:
@@ -2412,7 +2412,6 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2427,11 +2426,11 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2443,40 +2442,41 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e64 v0, -|v0|
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -|v0|
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -2497,7 +2497,6 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -2512,11 +2511,11 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_fneg_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2528,40 +2527,41 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e64 v0, -v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_fneg_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_fneg_v2f16:
@@ -2581,7 +2581,6 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_v2f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -2592,49 +2591,50 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_v2f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_v2f16_fast:
@@ -2653,7 +2653,6 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_exp_v3f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2669,11 +2668,11 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp_v3f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -2683,42 +2682,43 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp_v3f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp_v3f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp_v3f16:
@@ -2737,7 +2737,6 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_exp2_v3f16_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2753,11 +2752,11 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_exp2_v3f16_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -2767,42 +2766,43 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_exp2_v3f16_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_exp2_v3f16_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_v3f16_afn:
@@ -2821,7 +2821,6 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 define float @v_exp2_f32_contract(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_contract:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -2831,11 +2830,11 @@ define float @v_exp2_f32_contract(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_contract:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2845,6 +2844,7 @@ define float @v_exp2_f32_contract(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_contract:
@@ -2863,8 +2863,8 @@ define float @v_exp2_f32_contract(float %in) {
 define float @v_exp2_f32_contract_daz(float %in) #0 {
 ; GCN-LABEL: v_exp2_f32_contract_daz:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_exp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_contract_daz:
@@ -2883,7 +2883,6 @@ define float @v_exp2_f32_contract_daz(float %in) #0 {
 define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; GCN-SDAG-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; GCN-SDAG:       ; %bb.0:
-; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_mov_b32 s4, 0xc2fc0000
 ; GCN-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v2, 0x42800000
@@ -2893,11 +2892,11 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-GISEL-LABEL: v_exp2_f32_contract_nnan_ninf:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GCN-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2907,6 +2906,7 @@ define float @v_exp2_f32_contract_nnan_ninf(float %in) {
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_exp2_f32_contract_nnan_ninf:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 2c07b47bf1ed591..bc664613478c6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -12,7 +12,6 @@
 define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -21,38 +20,38 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX8-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -61,6 +60,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   ret { half, i32 } %result
@@ -69,42 +69,42 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 0
@@ -114,45 +114,45 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 1
@@ -162,7 +162,6 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -178,11 +177,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-SDAG-NEXT:    v_frexp_mant_f16_e32 v1, v0
 ; GFX8-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
@@ -192,11 +191,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v1
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v3, v0
@@ -206,11 +205,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
@@ -221,11 +220,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v3
 ; GFX11-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -241,11 +240,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
@@ -254,11 +253,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v2, v2
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
@@ -267,6 +266,7 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v2, v2
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   ret { <2 x half>, <2 x i32> } %result
@@ -275,7 +275,6 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -287,37 +286,37 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f16_e32 v1, v0
 ; GFX8-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
@@ -329,14 +328,15 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   %result.0 = extractvalue { <2 x half>, <2 x i32> } %result, 0
@@ -346,7 +346,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -358,29 +357,29 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX8-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -388,11 +387,11 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
@@ -402,26 +401,27 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v1, 0, 16
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   %result.1 = extractvalue { <2 x half>, <2 x i32> } %result, 1
@@ -431,7 +431,6 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i16:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -440,36 +439,36 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
@@ -478,6 +477,7 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   ret { half, i16 } %result
@@ -486,42 +486,42 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 define half @test_frexp_f16_i16_only_use_fract(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 0
@@ -531,41 +531,41 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
 define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 1
@@ -592,7 +592,6 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f32_i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -600,42 +599,43 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f32_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f32_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
   ret { float, i32 } %result
@@ -644,38 +644,38 @@ define { float, i32 } @test_frexp_f32_i32(float %a) {
 define float @test_frexp_f32_i32_only_use_fract(float %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
   %result.0 = extractvalue { float, i32 } %result, 0
@@ -685,38 +685,38 @@ define float @test_frexp_f32_i32_only_use_fract(float %a) {
 define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f32_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v1, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { float, i32 } @llvm.frexp.f32.i32(float %a)
   %result.0 = extractvalue { float, i32 } %result, 1
@@ -726,7 +726,6 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) {
 define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f32_v2i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -740,44 +739,44 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[4:5]
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_frexp_v2f32_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_frexp_mant_f32_e32 v4, v0
 ; GFX8-SDAG-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX8-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX8-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_frexp_v2f32_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f32_e32 v4, v0
 ; GFX9-SDAG-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f32_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v4, v0
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
@@ -789,28 +788,29 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_frexp_mant_f32_e32 v5, v0
 ; GFX8-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX8-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f32_e32 v5, v0
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   ret { <2 x float>, <2 x i32> } %result
@@ -819,7 +819,6 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -827,32 +826,32 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX8-NEXT:    v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
@@ -860,6 +859,7 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.0 = extractvalue { <2 x float>, <2 x i32> } %result, 0
@@ -869,7 +869,6 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -877,32 +876,32 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
@@ -910,6 +909,7 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.1 = extractvalue { <2 x float>, <2 x i32> } %result, 1
@@ -919,7 +919,6 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f64_i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
@@ -929,38 +928,38 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v0, v3
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX8-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f64_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f64_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[3:4], v[0:1]
@@ -969,6 +968,7 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   ret { double, i32 } %result
@@ -977,42 +977,42 @@ define { double, i32 } @test_frexp_f64_i32(double %a) {
 define double @test_frexp_f64_i32_only_use_fract(double %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-SDAG-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[0:1]
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[4:5], v[0:1]
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 0
@@ -1022,40 +1022,40 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) {
 define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v2, v[0:1]
 ; GFX6-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f64_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { double, i32 } @llvm.frexp.f64.i32(double %a)
   %result.0 = extractvalue { double, i32 } %result, 1
@@ -1065,7 +1065,6 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) {
 define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
@@ -1080,11 +1079,11 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[4:5]
 ; GFX6-SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[4:5]
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f64_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[8:9], v[0:1]
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
 ; GFX8-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
@@ -1093,11 +1092,11 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_v2f64_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[8:9], v[0:1]
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
 ; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
@@ -1106,11 +1105,11 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f64_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[8:9], v[0:1]
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[6:7], v[2:3]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
@@ -1118,11 +1117,11 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
 ; GFX11-NEXT:    v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0
 ; GFX6-GISEL-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f64_e32 v[5:6], v[0:1]
@@ -1137,6 +1136,7 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   ret { <2 x double>, <2 x i32> } %result
@@ -1145,7 +1145,6 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) {
 define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 ; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0
 ; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-NEXT:    v_frexp_mant_f64_e32 v[4:5], v[0:1]
@@ -1156,27 +1155,28 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 ; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX9-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f64_v2i32_only_use_fract:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    v_frexp_mant_f64_e32 v[2:3], v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0
@@ -1186,7 +1186,6 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) {
 define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0
 ; GFX6-NEXT:    s_mov_b32 s5, 0x7ff00000
 ; GFX6-NEXT:    v_frexp_exp_i32_f64_e32 v4, v[0:1]
@@ -1195,27 +1194,28 @@ define <2 x i32> @test_frexp_v2f64_v2i32_only_use_exp(<2 x double> %a) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
 ; GFX6-NEXT:    v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX8-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX9-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_frexp_v2f64_v2i32_only_use_exp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v0, v[0:1]
 ; GFX11-NEXT:    v_frexp_exp_i32_f64_e32 v1, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)
   %result.1 = extractvalue { <2 x double>, <2 x i32> } %result, 1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index a665e58adb8a1b1..a42870e67be341f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -11,34 +11,34 @@ declare i32 @llvm.get.fpmode.i32()
 define i32 @func_fpmode_i32() {
 ; GFX678-LABEL: func_fpmode_i32:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   ret i32 %fpmode
@@ -47,34 +47,34 @@ define i32 @func_fpmode_i32() {
 define i32 @strictfp_func_fpmode_i32() strictfp {
 ; GFX678-LABEL: strictfp_func_fpmode_i32:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: strictfp_func_fpmode_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: strictfp_func_fpmode_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: strictfp_func_fpmode_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   ret i32 %fpmode
@@ -160,38 +160,38 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
 define i32 @func_fpmode_i32_denormonly() {
 ; GFX678-LABEL: func_fpmode_i32_denormonly:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0xf0
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_denormonly:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xf0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_denormonly:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xf0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_denormonly:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xf0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %denorm.only = and i32 %fpmode, 240
@@ -201,38 +201,38 @@ define i32 @func_fpmode_i32_denormonly() {
 define i32 @func_fpmode_i32_roundonly() {
 ; GFX678-LABEL: func_fpmode_i32_roundonly:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_roundonly:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_roundonly:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 15
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_roundonly:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 15
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %round.only = and i32 %fpmode, 15
@@ -242,38 +242,38 @@ define i32 @func_fpmode_i32_roundonly() {
 define i32 @func_fpmode_i32_round_denorm_only() {
 ; GFX678-LABEL: func_fpmode_i32_round_denorm_only:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_round_denorm_only:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_round_denorm_only:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_round_denorm_only:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %round.denorm.only = and i32 %fpmode, 255
@@ -283,38 +283,38 @@ define i32 @func_fpmode_i32_round_denorm_only() {
 define i32 @func_fpmode_i32_round_denorm_dx10_ieee() {
 ; GFX678-LABEL: func_fpmode_i32_round_denorm_dx10_ieee:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x3ff
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_round_denorm_dx10_ieee:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x3ff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_round_denorm_dx10_ieee:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x3ff
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_round_denorm_dx10_ieee:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x3ff
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %core.mode = and i32 %fpmode, 1023
@@ -324,38 +324,38 @@ define i32 @func_fpmode_i32_round_denorm_dx10_ieee() {
 define i32 @func_fpmode_i32_excp_en() {
 ; GFX678-LABEL: func_fpmode_i32_excp_en:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x7f000
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_excp_en:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x7f000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_excp_en:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x7f000
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_excp_en:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x7f000
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %core.mode = and i32 %fpmode, 520192
@@ -366,38 +366,38 @@ define i32 @func_fpmode_i32_excp_en() {
 define i32 @func_fpmode_i32_environment_gfx6() {
 ; GFX678-LABEL: func_fpmode_i32_environment_gfx6:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x7f3ff
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_environment_gfx6:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x7f3ff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_environment_gfx6:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x7f3ff
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_environment_gfx6:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x7f3ff
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %core.mode = and i32 %fpmode, 521215
@@ -408,38 +408,38 @@ define i32 @func_fpmode_i32_environment_gfx6() {
 define i32 @func_fpmode_i32_environment_gfx9() {
 ; GFX678-LABEL: func_fpmode_i32_environment_gfx9:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x87f3ff
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_environment_gfx9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x87f3ff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_environment_gfx9:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x87f3ff
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_environment_gfx9:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x87f3ff
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %core.mode = and i32 %fpmode, 8909823
@@ -449,38 +449,38 @@ define i32 @func_fpmode_i32_environment_gfx9() {
 define i32 @func_fpmode_i32_denormf32only() {
 ; GFX678-LABEL: func_fpmode_i32_denormf32only:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 48
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_denormf32only:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 48
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_denormf32only:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 48
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_denormf32only:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 48
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %denorm.only = and i32 %fpmode, 48
@@ -490,38 +490,38 @@ define i32 @func_fpmode_i32_denormf32only() {
 define i32 @func_fpmode_i32_denormf32only_0() {
 ; GFX678-LABEL: func_fpmode_i32_denormf32only_0:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 32
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_denormf32only_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 32
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_denormf32only_0:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 32
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_denormf32only_0:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 32
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %denorm.only = and i32 %fpmode, 32
@@ -531,38 +531,38 @@ define i32 @func_fpmode_i32_denormf32only_0() {
 define i32 @func_fpmode_i32_denormf32only_1() {
 ; GFX678-LABEL: func_fpmode_i32_denormf32only_1:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 64
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_denormf32only_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 64
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_denormf32only_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 64
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_denormf32only_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 64
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %denorm.only = and i32 %fpmode, 64
@@ -572,38 +572,38 @@ define i32 @func_fpmode_i32_denormf32only_1() {
 define i32 @func_fpmode_i32_denormf64f16only() {
 ; GFX678-LABEL: func_fpmode_i32_denormf64f16only:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0xc0
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_denormf64f16only:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xc0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_denormf64f16only:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xc0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_denormf64f16only:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xc0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %denorm.only = and i32 %fpmode, 192
@@ -613,38 +613,38 @@ define i32 @func_fpmode_i32_denormf64f16only() {
 define i32 @func_fpmode_i32_dx10_clamp_only() {
 ; GFX678-LABEL: func_fpmode_i32_dx10_clamp_only:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x100
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_dx10_clamp_only:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x100
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_dx10_clamp_only:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x100
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_dx10_clamp_only:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x100
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %dx10.only = and i32 %fpmode, 256
@@ -654,38 +654,38 @@ define i32 @func_fpmode_i32_dx10_clamp_only() {
 define i32 @func_fpmode_i32_ieee_only() {
 ; GFX678-LABEL: func_fpmode_i32_ieee_only:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 19)
 ; GFX678-NEXT:    s_and_b32 s4, 0x7f3ff, s4
 ; GFX678-NEXT:    s_and_b32 s4, s4, 0x200
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_fpmode_i32_ieee_only:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX9-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0x200
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_fpmode_i32_ieee_only:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 24)
 ; GFX10-NEXT:    s_and_b32 s4, 0x87f3ff, s4
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0x200
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_fpmode_i32_ieee_only:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
 ; GFX11-NEXT:    s_and_b32 s0, 0x87f3ff, s0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0x200
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpmode = call i32 @llvm.get.fpmode.i32()
   %ieee.only = and i32 %fpmode, 512
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll
index 79766f0555ad195..35916430faae14e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.rounding.ll
@@ -11,7 +11,6 @@ declare i32 @llvm.get.rounding()
 define i32 @func_rounding() {
 ; GFX678-LABEL: func_rounding:
 ; GFX678:       ; %bb.0:
-; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
 ; GFX678-NEXT:    s_lshl_b32 s6, s4, 2
 ; GFX678-NEXT:    s_mov_b32 s4, 0xeb24da71
@@ -22,11 +21,11 @@ define i32 @func_rounding() {
 ; GFX678-NEXT:    s_cmp_lt_u32 s4, 4
 ; GFX678-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s4
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX678-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: func_rounding:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
 ; GFX9-NEXT:    s_lshl_b32 s6, s4, 2
 ; GFX9-NEXT:    s_mov_b32 s4, 0xeb24da71
@@ -37,11 +36,11 @@ define i32 @func_rounding() {
 ; GFX9-NEXT:    s_cmp_lt_u32 s4, 4
 ; GFX9-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: func_rounding:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
 ; GFX10-NEXT:    s_lshl_b32 s6, s4, 2
 ; GFX10-NEXT:    s_mov_b32 s4, 0xeb24da71
@@ -52,11 +51,11 @@ define i32 @func_rounding() {
 ; GFX10-NEXT:    s_cmp_lt_u32 s4, 4
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: func_rounding:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
 ; GFX11-NEXT:    s_lshl_b32 s2, s0, 2
 ; GFX11-NEXT:    s_mov_b32 s0, 0xeb24da71
@@ -67,6 +66,7 @@ define i32 @func_rounding() {
 ; GFX11-NEXT:    s_cmp_lt_u32 s0, 4
 ; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %rounding = call i32 @llvm.get.rounding()
   ret i32 %rounding
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index d74948a460c98b7..30bf48ce47931dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -98,32 +98,32 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 define i1 @zeromask_f16(half %x) nounwind {
 ; GFX7CHECK-LABEL: zeromask_f16:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: zeromask_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: zeromask_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: zeromask_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: zeromask_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 0)
   ret i1 %1
@@ -133,32 +133,32 @@ define i1 @zeromask_f16(half %x) nounwind {
 define i1 @allflags_f16(half %x) nounwind {
 ; GFX7CHECK-LABEL: allflags_f16:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: allflags_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: allflags_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: allflags_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: allflags_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1023) ; 0x3ff
   ret i1 %1
@@ -167,7 +167,6 @@ define i1 @allflags_f16(half %x) nounwind {
 define i1 @snan_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: snan_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7e00
 ; GFX7SELDAG-NEXT:    s_movk_i32 s5, 0x7c00
@@ -176,11 +175,11 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s5, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: snan_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -189,34 +188,35 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: snan_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: snan_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: snan_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 1
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: snan_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 1
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1)  ; 0x001
   ret i1 %1
@@ -225,50 +225,50 @@ define i1 @snan_f16(half %x) nounwind {
 define i1 @qnan_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: qnan_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7dff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: qnan_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7e00
 ; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: qnan_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 2
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: qnan_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 2
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: qnan_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 2
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: qnan_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 2
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 2)  ; 0x002
   ret i1 %1
@@ -277,50 +277,50 @@ define i1 @qnan_f16(half %x) nounwind {
 define i1 @posinf_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: posinf_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posinf_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posinf_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x200
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posinf_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x200
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posinf_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x200
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posinf_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x200
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 512)  ; 0x200
   ret i1 %1
@@ -329,48 +329,48 @@ define i1 @posinf_f16(half %x) nounwind {
 define i1 @neginf_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: neginf_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_mov_b32 s4, 0xfc00
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: neginf_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0xfc00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: neginf_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 4
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: neginf_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 4
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: neginf_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 4
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: neginf_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 4
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 4)  ; 0x004
   ret i1 %1
@@ -379,7 +379,6 @@ define i1 @neginf_f16(half %x) nounwind {
 define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: posnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7800
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -390,11 +389,11 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -405,36 +404,37 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x100
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x100
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x100
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x100
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 256)  ; 0x100
   ret i1 %1
@@ -443,7 +443,6 @@ define i1 @posnormal_f16(half %x) nounwind {
 define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: negnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7800
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -454,11 +453,11 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -469,34 +468,35 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 8
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 8
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 8
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 8
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 8)  ; 0x008
   ret i1 %1
@@ -505,53 +505,53 @@ define i1 @negnormal_f16(half %x) nounwind {
 define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: possubnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3ff
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: possubnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: possubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: possubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: possubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x80
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: possubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x80
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 128)  ; 0x080
   ret i1 %1
@@ -560,7 +560,6 @@ define i1 @possubnormal_f16(half %x) nounwind {
 define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: negsubnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -570,11 +569,11 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negsubnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v1
@@ -585,34 +584,35 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negsubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 16
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negsubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 16
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negsubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 16
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negsubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 16
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 16)  ; 0x010
   ret i1 %1
@@ -621,46 +621,46 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 define i1 @poszero_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: poszero_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: poszero_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: poszero_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 64
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: poszero_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 64
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: poszero_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 64
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: poszero_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 64
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 64)  ; 0x040
   ret i1 %1
@@ -669,48 +669,48 @@ define i1 @poszero_f16(half %x) nounwind {
 define i1 @negzero_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: negzero_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_mov_b32 s4, 0x8000
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negzero_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x8000
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negzero_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 32
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negzero_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 32
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negzero_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 32
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negzero_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 32
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 32)  ; 0x020
   ret i1 %1
@@ -719,50 +719,50 @@ define i1 @negzero_f16(half %x) nounwind {
 define i1 @posfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: posfinite_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: posfinite_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: posfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1c0
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: posfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1c0
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: posfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1c0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: posfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1c0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 448)  ; 0x1c0
   ret i1 %1
@@ -771,7 +771,6 @@ define i1 @posfinite_f16(half %x) nounwind {
 define i1 @negfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: negfinite_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -780,11 +779,11 @@ define i1 @negfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e64 s[4:5], s4, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: negfinite_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -793,34 +792,35 @@ define i1 @negfinite_f16(half %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v1, v0
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: negfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 56
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: negfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 56
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: negfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 56
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: negfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 56
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 56)  ; 0x038
   ret i1 %1
@@ -829,50 +829,50 @@ define i1 @negfinite_f16(half %x) nounwind {
 define i1 @isnan_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: isnan_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3)  ; nan
   ret i1 %1
@@ -881,17 +881,16 @@ define i1 @isnan_f16(half %x) nounwind {
 define i1 @not_isnan_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_isnan_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnan_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -899,36 +898,37 @@ define i1 @not_isnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnan_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3fc
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnan_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3fc
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnan_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3fc
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnan_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3fc
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = call i1 @llvm.is.fpclass.f16(half %x, i32 1020)  ; ~nan
   ret i1 %class
@@ -937,7 +937,6 @@ define i1 @not_isnan_f16(half %x) {
 define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
 ; GFX7SELDAG-LABEL: isnan_v2f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
@@ -947,11 +946,11 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v1
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_v2f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
@@ -961,48 +960,49 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v2f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v1, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v2f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_sdwa s[4:5], v0, v1 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v2f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 3
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_sdwa s4, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v2f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v1, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> %x, i32 3)  ; nan
   ret <2 x i1> %1
@@ -1011,7 +1011,6 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
 define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX7SELDAG-LABEL: isnan_v3f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1025,11 +1024,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v2
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_v3f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v3, 0x7c00
@@ -1043,11 +1042,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v3
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8SELDAG-LABEL: isnan_v3f16:
 ; GFX8SELDAG:       ; %bb.0:
-; GFX8SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8SELDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8SELDAG-NEXT:    v_cmp_u_f16_e32 vcc, v2, v2
 ; GFX8SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -1056,11 +1055,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX8SELDAG-NEXT:    v_cmp_u_f16_e32 vcc, v1, v1
 ; GFX8SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8SELDAG-NEXT:    v_mov_b32_e32 v1, v3
+; GFX8SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8GLISEL-LABEL: isnan_v3f16:
 ; GFX8GLISEL:       ; %bb.0:
-; GFX8GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GLISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -1069,11 +1068,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX8GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v1, 3
 ; GFX8GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX8GLISEL-NEXT:    v_mov_b32_e32 v1, v3
+; GFX8GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9SELDAG-LABEL: isnan_v3f16:
 ; GFX9SELDAG:       ; %bb.0:
-; GFX9SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9SELDAG-NEXT:    v_cmp_u_f16_sdwa s[4:5], v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9SELDAG-NEXT:    v_cmp_u_f16_e32 vcc, v0, v0
 ; GFX9SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1081,11 +1080,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX9SELDAG-NEXT:    v_cmp_u_f16_e32 vcc, v1, v1
 ; GFX9SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9SELDAG-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9GLISEL-LABEL: isnan_v3f16:
 ; GFX9GLISEL:       ; %bb.0:
-; GFX9GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX9GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9GLISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
@@ -1095,11 +1094,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX9GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10SELDAG-LABEL: isnan_v3f16:
 ; GFX10SELDAG:       ; %bb.0:
-; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10SELDAG-NEXT:    v_cmp_u_f16_sdwa s4, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10SELDAG-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v0
 ; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1107,11 +1106,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX10SELDAG-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v1
 ; GFX10SELDAG-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10GLISEL-LABEL: isnan_v3f16:
 ; GFX10GLISEL:       ; %bb.0:
-; GFX10GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10GLISEL-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10GLISEL-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
 ; GFX10GLISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
@@ -1121,11 +1120,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX10GLISEL-NEXT:    v_cmp_class_f16_e64 s4, v1, 3
 ; GFX10GLISEL-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-LABEL: isnan_v3f16:
 ; GFX11SELDAG:       ; %bb.0:
-; GFX11SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11SELDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11SELDAG-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v0
 ; GFX11SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
@@ -1134,11 +1133,11 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX11SELDAG-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v1
 ; GFX11SELDAG-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-LABEL: isnan_v3f16:
 ; GFX11GLISEL:       ; %bb.0:
-; GFX11GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11GLISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11GLISEL-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
 ; GFX11GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -1147,6 +1146,7 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX11GLISEL-NEXT:    v_cmp_class_f16_e64 s0, v1, 3
 ; GFX11GLISEL-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX11GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11GLISEL-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> %x, i32 3)  ; nan
   ret <3 x i1> %1
@@ -1155,7 +1155,6 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX7SELDAG-LABEL: isnan_v4f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1173,11 +1172,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v3
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_v4f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -1195,11 +1194,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v4
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8SELDAG-LABEL: isnan_v4f16:
 ; GFX8SELDAG:       ; %bb.0:
-; GFX8SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8SELDAG-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8SELDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -1210,11 +1209,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX8SELDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8SELDAG-NEXT:    v_cmp_class_f16_e64 s[4:5], v4, 3
 ; GFX8SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX8SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8GLISEL-LABEL: isnan_v4f16:
 ; GFX8GLISEL:       ; %bb.0:
-; GFX8GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GLISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -1226,11 +1225,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX8GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v3, 3
 ; GFX8GLISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX8GLISEL-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9SELDAG-LABEL: isnan_v4f16:
 ; GFX9SELDAG:       ; %bb.0:
-; GFX9SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9SELDAG-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9SELDAG-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9SELDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
@@ -1242,11 +1241,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX9SELDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX9SELDAG-NEXT:    v_mov_b32_e32 v0, v5
 ; GFX9SELDAG-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9GLISEL-LABEL: isnan_v4f16:
 ; GFX9GLISEL:       ; %bb.0:
-; GFX9GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX9GLISEL-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9GLISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
@@ -1258,11 +1257,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX9GLISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9GLISEL-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10SELDAG-LABEL: isnan_v4f16:
 ; GFX10SELDAG:       ; %bb.0:
-; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10SELDAG-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s5, v0, 3
 ; GFX10SELDAG-NEXT:    v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
@@ -1274,11 +1273,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX10SELDAG-NEXT:    v_cmp_class_f16_e64 s5, v1, 3
 ; GFX10SELDAG-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10SELDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
+; GFX10SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10GLISEL-LABEL: isnan_v4f16:
 ; GFX10GLISEL:       ; %bb.0:
-; GFX10GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10GLISEL-NEXT:    v_mov_b32_e32 v3, 3
 ; GFX10GLISEL-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
 ; GFX10GLISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
@@ -1290,11 +1289,11 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX10GLISEL-NEXT:    v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10GLISEL-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX10GLISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v4f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11CHECK-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -1305,6 +1304,7 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v4, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> %x, i32 3)  ; nan
   ret <4 x i1> %1
@@ -1313,50 +1313,50 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
 ; GFX7SELDAG-LABEL: isnan_f16_strictfp:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnan_f16_strictfp:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f16_strictfp:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f16_strictfp:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f16_strictfp:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f16_strictfp:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1365,52 +1365,52 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
 define i1 @isinf_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: isinf_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isinf_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x204
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1419,52 +1419,52 @@ define i1 @isinf_f16(half %x) nounwind {
 define i1 @isfinite_f16(half %x) nounwind {
 ; GFX7SELDAG-LABEL: isfinite_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isfinite_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1f8
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1f8
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1473,50 +1473,50 @@ define i1 @isfinite_f16(half %x) nounwind {
 define i1 @issubnormal_or_zero_f16(half %x) {
 ; GFX7SELDAG-LABEL: issubnormal_or_zero_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: issubnormal_or_zero_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0xf0
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0xf0
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0xf0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_or_zero_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0xf0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 240)  ; 0xf0 = "subnormal|zero"
@@ -1526,16 +1526,15 @@ entry:
 define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_issubnormal_or_zero_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_issubnormal_or_zero_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
@@ -1548,36 +1547,37 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x30f
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x30f
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x30f
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_or_zero_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x30f
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
     %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
@@ -1587,7 +1587,6 @@ entry:
 define i1 @isnormal_f16(half %x) {
 ; GFX7SELDAG-LABEL: isnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7800
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -1595,47 +1594,48 @@ define i1 @isnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x108
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x108
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 264)  ; 0x108 = "normal"
   ret i1 %class
@@ -1644,7 +1644,6 @@ define i1 @isnormal_f16(half %x) {
 define i1 @not_isnormal_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_isnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x77ff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -1652,11 +1651,11 @@ define i1 @not_isnormal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1668,36 +1667,37 @@ define i1 @not_isnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x2f7
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x2f7
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x2f7
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x2f7
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 759)  ; ~0x108 = "~normal"
   ret i1 %class
@@ -1706,7 +1706,6 @@ define i1 @not_isnormal_f16(half %x) {
 define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_is_plus_normal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x77ff
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -1717,11 +1716,11 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_is_plus_normal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
@@ -1741,36 +1740,37 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x2ff
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x2ff
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x2ff
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_plus_normal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x2ff
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 767)  ; ~0x100 = ~"+normal"
   ret i1 %class
@@ -1779,7 +1779,6 @@ define i1 @not_is_plus_normal_f16(half %x) {
 define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_is_neg_normal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x77ff
 ; GFX7SELDAG-NEXT:    v_bfe_i32 v1, v0, 0, 16
@@ -1790,11 +1789,11 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_is_neg_normal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
@@ -1814,36 +1813,37 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3f7
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3f7
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3f7
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_is_neg_normal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3f7
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 1015)  ; ~0x008 = ~"-normal"
   ret i1 %class
@@ -1852,54 +1852,54 @@ define i1 @not_is_neg_normal_f16(half %x) {
 define i1 @issubnormal_f16(half %x) {
 ; GFX7SELDAG-LABEL: issubnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3ff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: issubnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x90
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x90
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %class
@@ -1908,18 +1908,17 @@ define i1 @issubnormal_f16(half %x) {
 define i1 @not_issubnormal_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_issubnormal_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x3fe
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_add_i32_e32 v0, vcc, -1, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_issubnormal_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
@@ -1934,36 +1933,37 @@ define i1 @not_issubnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_issubnormal_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x36f
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_issubnormal_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x36f
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_issubnormal_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x36f
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_issubnormal_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x36f
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 879)  ; ~0x90 = ~"subnormal"
   ret i1 %class
@@ -1972,50 +1972,50 @@ define i1 @not_issubnormal_f16(half %x) {
 define i1 @iszero_f16(half %x) {
 ; GFX7SELDAG-LABEL: iszero_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x60
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x60
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 96)  ; 0x60 = "zero"
   ret i1 %class
@@ -2024,16 +2024,15 @@ define i1 @iszero_f16(half %x) {
 define i1 @not_iszero_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_iszero_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2051,36 +2050,37 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39f
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39f
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39f
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39f
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 927)  ; ~0x60 = ~"zero"
   ret i1 %class
@@ -2089,52 +2089,52 @@ define i1 @not_iszero_f16(half %x) {
 define i1 @ispositive_f16(half %x) {
 ; GFX7SELDAG-LABEL: ispositive_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: ispositive_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: ispositive_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c0
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: ispositive_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c0
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: ispositive_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3c0
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: ispositive_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3c0
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 960)  ; fcPositive
   ret i1 %class
@@ -2143,7 +2143,6 @@ define i1 @ispositive_f16(half %x) {
 define i1 @not_ispositive_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_ispositive_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7c00
 ; GFX7SELDAG-NEXT:    s_mov_b32 s7, 0xfc00
@@ -2157,11 +2156,11 @@ define i1 @not_ispositive_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s6, v2
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_ispositive_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2175,34 +2174,35 @@ define i1 @not_ispositive_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_ispositive_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 63
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_ispositive_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 63
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_ispositive_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 63
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_ispositive_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 63
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 63)  ; ~fcPositive
   ret i1 %class
@@ -2211,7 +2211,6 @@ define i1 @not_ispositive_f16(half %x) {
 define i1 @isnegative_f16(half %x) {
 ; GFX7SELDAG-LABEL: isnegative_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    s_mov_b32 s6, 0xfc00
@@ -2223,11 +2222,11 @@ define i1 @isnegative_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isnegative_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2239,34 +2238,35 @@ define i1 @isnegative_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnegative_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 60
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnegative_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, 60
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnegative_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 60
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnegative_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 60
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 60)  ; fcNegative
   ret i1 %class
@@ -2275,7 +2275,6 @@ define i1 @isnegative_f16(half %x) {
 define i1 @not_isnegative_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_isnegative_f16:
 ; GFX7SELDAG:       ; %bb.0:
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    s_movk_i32 s5, 0x7c00
@@ -2284,11 +2283,11 @@ define i1 @not_isnegative_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e64 s[4:5], s5, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isnegative_f16:
 ; GFX7GLISEL:       ; %bb.0:
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
@@ -2299,36 +2298,37 @@ define i1 @not_isnegative_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isnegative_f16:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c3
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isnegative_f16:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x3c3
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isnegative_f16:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x3c3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isnegative_f16:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x3c3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 963)  ; ~fcNegative
   ret i1 %class
@@ -2337,7 +2337,6 @@ define i1 @not_isnegative_f16(half %x) {
 define i1 @iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-LABEL: iszero_or_nan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2345,11 +2344,11 @@ define i1 @iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -2357,36 +2356,37 @@ define i1 @iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2396,7 +2396,6 @@ entry:
 define i1 @iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-LABEL: iszero_or_nan_f_daz:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2404,11 +2403,11 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f_daz:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -2416,36 +2415,37 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2455,7 +2455,6 @@ entry:
 define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2463,11 +2462,11 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -2475,36 +2474,37 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x63
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x63
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x63
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99)  ; 0x60|0x3 = "zero|nan"
@@ -2514,7 +2514,6 @@ entry:
 define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_iszero_or_nan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2522,11 +2521,11 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2542,36 +2541,37 @@ define i1 @not_iszero_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~0x60 = "~(zero|nan)"
@@ -2581,7 +2581,6 @@ entry:
 define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-LABEL: not_iszero_or_nan_f_daz:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2589,11 +2588,11 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_daz:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2609,36 +2608,37 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2648,7 +2648,6 @@ entry:
 define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c01
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2656,11 +2655,11 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2676,36 +2675,37 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39c
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39c
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39c
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
@@ -2715,7 +2715,6 @@ entry:
 define i1 @iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-LABEL: iszero_or_qnan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7dff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -2723,11 +2722,11 @@ define i1 @iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_qnan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7e00
@@ -2735,36 +2734,37 @@ define i1 @iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x62
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x62
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x62
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_qnan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x62
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 98)  ; 0x60|0x2 = "zero|qnan"
@@ -2774,7 +2774,6 @@ entry:
 define i1 @iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-LABEL: iszero_or_snan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7e00
 ; GFX7SELDAG-NEXT:    s_movk_i32 s5, 0x7c00
@@ -2785,11 +2784,11 @@ define i1 @iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: iszero_or_snan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -2800,36 +2799,37 @@ define i1 @iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_or_snan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x61
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_or_snan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x61
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_or_snan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x61
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_or_snan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x61
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 97)  ; 0x60|0x1 = "zero|snan"
@@ -2839,7 +2839,6 @@ entry:
 define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_iszero_or_qnan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7e00
 ; GFX7SELDAG-NEXT:    s_movk_i32 s8, 0x7c00
@@ -2859,11 +2858,11 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_qnan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2884,36 +2883,37 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39d
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39d
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39d
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_qnan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39d
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 925)  ; ~(0x60|0x2) = "~(zero|qnan)"
@@ -2923,7 +2923,6 @@ entry:
 define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_iszero_or_snan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    s_movk_i32 s6, 0x7dff
@@ -2941,11 +2940,11 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7SELDAG-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
 ; GFX7SELDAG-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_iszero_or_snan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v1, vcc, 1, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -2964,36 +2963,37 @@ define i1 @not_iszero_or_snan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x39e
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x39e
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x39e
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_iszero_or_snan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x39e
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 926)  ; ~(0x60|0x1) = "~(zero|snan)"
@@ -3003,17 +3003,16 @@ entry:
 define i1 @isinf_or_nan_f16(half %x) {
 ; GFX7SELDAG-LABEL: isinf_or_nan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7bff
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_lt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isinf_or_nan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -3021,36 +3020,37 @@ define i1 @isinf_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x207
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x207
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x207
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x207
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 519)  ; 0x204|0x3 = "inf|nan"
@@ -3060,52 +3060,52 @@ entry:
 define i1 @not_isinf_or_nan_f16(half %x) {
 ; GFX7SELDAG-LABEL: not_isinf_or_nan_f16:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_gt_i32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isinf_or_nan_f16:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1f8
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isinf_or_nan_f16:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1f8
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 504)  ; ~(0x204|0x3) = "~(inf|nan)"
@@ -3115,17 +3115,16 @@ entry:
 define i1 @isfinite_or_nan_f(half %x) {
 ; GFX7SELDAG-LABEL: isfinite_or_nan_f:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_ne_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: isfinite_or_nan_f:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
@@ -3133,36 +3132,37 @@ define i1 @isfinite_or_nan_f(half %x) {
 ; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1fb
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1fb
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x1fb
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x1fb
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 507)  ; 0x1f8|0x3 = "finite|nan"
@@ -3172,52 +3172,52 @@ entry:
 define i1 @not_isfinite_or_nan_f(half %x) {
 ; GFX7SELDAG-LABEL: not_isfinite_or_nan_f:
 ; GFX7SELDAG:       ; %bb.0: ; %entry
-; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7SELDAG-NEXT:    s_movk_i32 s4, 0x7c00
 ; GFX7SELDAG-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7SELDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
 ; GFX7SELDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7SELDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7GLISEL-LABEL: not_isfinite_or_nan_f:
 ; GFX7GLISEL:       ; %bb.0: ; %entry
-; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX8CHECK:       ; %bb.0: ; %entry
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX8CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX9CHECK:       ; %bb.0: ; %entry
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX9CHECK-NEXT:    v_cmp_class_f16_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX10CHECK:       ; %bb.0: ; %entry
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f16_e64 s4, v0, 0x204
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: not_isfinite_or_nan_f:
 ; GFX11CHECK:       ; %bb.0: ; %entry
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 516)  ; ~(0x1f8|0x3) = "~(finite|nan)"
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 638be76bf65786a..24230b75ff51288 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -178,38 +178,38 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
 define i1 @isnan_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3)  ; nan
   ret i1 %1
@@ -218,48 +218,48 @@ define i1 @isnan_f32(float %x) nounwind {
 define <2 x i1> @isnan_v2f32(<2 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v2f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v2f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v2f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v2f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v2f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v1, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 3)  ; nan
   ret <2 x i1> %1
@@ -268,51 +268,50 @@ define <2 x i1> @isnan_v2f32(<2 x float> %x) nounwind {
 define <3 x i1> @isnan_v3f32(<3 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v3f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v2, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v3f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v2, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v3f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v2, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v3f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v2, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v3f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -321,6 +320,7 @@ define <3 x i1> @isnan_v3f32(<3 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v2, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <3 x i1> @llvm.is.fpclass.v3f32(<3 x float> %x, i32 3)  ; nan
   ret <3 x i1> %1
@@ -329,7 +329,6 @@ define <3 x i1> @isnan_v3f32(<3 x float> %x) nounwind {
 define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v4f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -338,11 +337,11 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v3, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v4f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -351,11 +350,11 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v3, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v4f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -364,11 +363,11 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v3, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v4f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -377,11 +376,11 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v3, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v4f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -392,6 +391,7 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v3, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %x, i32 3)  ; nan
   ret <4 x i1> %1
@@ -400,7 +400,6 @@ define <4 x i1> @isnan_v4f32(<4 x float> %x) nounwind {
 define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v5f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -411,11 +410,11 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v4, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v5f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -426,11 +425,11 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v4, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v5f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -441,11 +440,11 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v4, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v5f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -456,11 +455,11 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v4, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v5f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -474,6 +473,7 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v4, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <5 x i1> @llvm.is.fpclass.v5f32(<5 x float> %x, i32 3)  ; nan
   ret <5 x i1> %1
@@ -482,7 +482,6 @@ define <5 x i1> @isnan_v5f32(<5 x float> %x) nounwind {
 define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v6f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -495,11 +494,11 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v5, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v6f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -512,11 +511,11 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v5, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v6f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -529,11 +528,11 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v5, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v6f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -546,11 +545,11 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v5, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v6f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -566,6 +565,7 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v5, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <6 x i1> @llvm.is.fpclass.v6f32(<6 x float> %x, i32 3)  ; nan
   ret <6 x i1> %1
@@ -574,7 +574,6 @@ define <6 x i1> @isnan_v6f32(<6 x float> %x) nounwind {
 define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v7f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -589,11 +588,11 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v6, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v7f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -608,11 +607,11 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v6, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v7f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -627,11 +626,11 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v6, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v7f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -646,11 +645,11 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v6, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v7f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -669,6 +668,7 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v6, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <7 x i1> @llvm.is.fpclass.v7f32(<7 x float> %x, i32 3)  ; nan
   ret <7 x i1> %1
@@ -677,7 +677,6 @@ define <7 x i1> @isnan_v7f32(<7 x float> %x) nounwind {
 define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v8f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -694,11 +693,11 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v7, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v8f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -715,11 +714,11 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v7, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v8f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -736,11 +735,11 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v7, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v8f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -757,11 +756,11 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v7, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v8f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -782,6 +781,7 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v7, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <8 x i1> @llvm.is.fpclass.v8f32(<8 x float> %x, i32 3)  ; nan
   ret <8 x i1> %1
@@ -790,7 +790,6 @@ define <8 x i1> @isnan_v8f32(<8 x float> %x) nounwind {
 define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_v16f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -823,11 +822,11 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v15, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_v16f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -860,11 +859,11 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v15, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_v16f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, 3
@@ -897,11 +896,11 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v15, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_v16f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v1, 3
@@ -934,11 +933,11 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v15, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_v16f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
@@ -979,6 +978,7 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v15, 3
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <16 x i1> @llvm.is.fpclass.v16f32(<16 x float> %x, i32 3)  ; nan
   ret <16 x i1> %1
@@ -987,38 +987,38 @@ define <16 x i1> @isnan_v16f32(<16 x float> %x) nounwind {
 define i1 @isnan_f64(double %x) nounwind {
 ; GFX7CHECK-LABEL: isnan_f64:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f64:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f64:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f64:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3)  ; nan
   ret i1 %1
@@ -1027,38 +1027,38 @@ define i1 @isnan_f64(double %x) nounwind {
 define i1 @isnan_f32_strictfp(float %x) strictfp nounwind {
 ; GFX7CHECK-LABEL: isnan_f32_strictfp:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f32_strictfp:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f32_strictfp:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f32_strictfp:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f32_strictfp:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1067,38 +1067,38 @@ define i1 @isnan_f32_strictfp(float %x) strictfp nounwind {
 define i1 @isnan_f64_strictfp(double %x) strictfp nounwind {
 ; GFX7CHECK-LABEL: isnan_f64_strictfp:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnan_f64_strictfp:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnan_f64_strictfp:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e64 s[4:5], v[0:1], 3
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnan_f64_strictfp:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 3
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnan_f64_strictfp:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 3
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan
   ret i1 %1
@@ -1107,41 +1107,41 @@ define i1 @isnan_f64_strictfp(double %x) strictfp nounwind {
 define i1 @isinf_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: isinf_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x204
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x204
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x204
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1150,41 +1150,41 @@ define i1 @isinf_f32(float %x) nounwind {
 define i1 @isinf_f64(double %x) nounwind {
 ; GFX7CHECK-LABEL: isinf_f64:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isinf_f64:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isinf_f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v2, 0x204
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isinf_f64:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 0x204
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isinf_f64:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x204
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 516)  ; 0x204 = "inf"
   ret i1 %1
@@ -1193,41 +1193,41 @@ define i1 @isinf_f64(double %x) nounwind {
 define i1 @isfinite_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: isfinite_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f8
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x1f8
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x1f8
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1236,41 +1236,41 @@ define i1 @isfinite_f32(float %x) nounwind {
 define i1 @isfinite_f64(double %x) nounwind {
 ; GFX7CHECK-LABEL: isfinite_f64:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isfinite_f64:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isfinite_f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f8
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v2
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isfinite_f64:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 0x1f8
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isfinite_f64:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x1f8
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %1
@@ -1279,41 +1279,41 @@ define i1 @isfinite_f64(double %x) nounwind {
 define i1 @isnormal_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: isnormal_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x108
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x108
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x108
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264)  ; 0x108 = "normal"
   ret i1 %1
@@ -1322,51 +1322,51 @@ define i1 @isnormal_f32(float %x) nounwind {
 define <2 x i1> @isnormal_v2f64(<2 x double> %x) nounwind {
 ; GFX7CHECK-LABEL: isnormal_v2f64:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v4, 0x108
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX7CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v4
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: isnormal_v2f64:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v4, 0x108
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX8CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v4
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: isnormal_v2f64:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v4, 0x108
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v4
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9CHECK-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v4
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: isnormal_v2f64:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[0:1], 0x108
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10CHECK-NEXT:    v_cmp_class_f64_e64 s4, v[2:3], 0x108
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: isnormal_v2f64:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[0:1], 0x108
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11CHECK-NEXT:    v_cmp_class_f64_e64 s0, v[2:3], 0x108
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call <2 x i1> @llvm.is.fpclass.v2f64(<2 x double> %x, i32 264)  ; 0x108 = "normal"
   ret <2 x i1> %1
@@ -1375,41 +1375,41 @@ define <2 x i1> @isnormal_v2f64(<2 x double> %x) nounwind {
 define i1 @issubnormal_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: issubnormal_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: issubnormal_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: issubnormal_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x90
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: issubnormal_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x90
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: issubnormal_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x90
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %1
@@ -1418,41 +1418,41 @@ define i1 @issubnormal_f32(float %x) nounwind {
 define i1 @iszero_f32(float %x) nounwind {
 ; GFX7CHECK-LABEL: iszero_f32:
 ; GFX7CHECK:       ; %bb.0:
-; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
 ; GFX7CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX7CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8CHECK-LABEL: iszero_f32:
 ; GFX8CHECK:       ; %bb.0:
-; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
 ; GFX8CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX8CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9CHECK-LABEL: iszero_f32:
 ; GFX9CHECK:       ; %bb.0:
-; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x60
 ; GFX9CHECK-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
 ; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX9CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10CHECK-LABEL: iszero_f32:
 ; GFX10CHECK:       ; %bb.0:
-; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    v_cmp_class_f32_e64 s4, v0, 0x60
 ; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10CHECK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11CHECK-LABEL: iszero_f32:
 ; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    v_cmp_class_f32_e64 s0, v0, 0x60
 ; GFX11CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11CHECK-NEXT:    s_setpc_b64 s[30:31]
   %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96)  ; 0x60 = "zero"
   ret i1 %1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 272addf59085ea5..c980940b964054d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -12,26 +12,26 @@
 define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) {
 ; GFX6-LABEL: test_ldexp_f32_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_f32_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_f32_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_f32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.ldexp.f32.i32(float %a, i32 %b)
   ret float %result
@@ -40,30 +40,30 @@ define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) {
 define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a, <2 x i32> %b) {
 ; GFX6-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v4
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v3, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX8-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX9-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %a, <2 x i32> %b)
   ret <2 x float> %result
@@ -72,34 +72,34 @@ define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a
 define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a, <3 x i32> %b) {
 ; GFX6-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v5
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v3, v6
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX8-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX8-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX9-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX9-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX11-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.ldexp.v3f32.v3i32(<3 x float> %a, <3 x i32> %b)
   ret <3 x float> %result
@@ -108,38 +108,38 @@ define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a
 define <4 x float> @test_ldexp_v4f32_v4i32(ptr addrspace(1) %out, <4 x float> %a, <4 x i32> %b) {
 ; GFX6-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v6
 ; GFX6-NEXT:    v_ldexp_f32_e32 v1, v3, v7
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v8
 ; GFX6-NEXT:    v_ldexp_f32_e32 v3, v5, v9
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX8-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX8-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX8-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX9-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX9-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX9-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX11-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX11-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %a, <4 x i32> %b)
   ret <4 x float> %result
@@ -148,8 +148,8 @@ define <4 x float> @test_ldexp_v4f32_v4i32(ptr addrspace(1) %out, <4 x float> %a
 define double @test_ldexp_f64_i32(double %a, i32 %b) {
 ; GCN-LABEL: test_ldexp_f64_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.ldexp.f64.i32(double %a, i32 %b)
   ret double %result
@@ -158,9 +158,9 @@ define double @test_ldexp_f64_i32(double %a, i32 %b) {
 define <2 x double> @test_ldexp_v2f64_v2i32(<2 x double> %a, <2 x i32> %b) {
 ; GCN-LABEL: test_ldexp_v2f64_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
 ; GCN-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %a, <2 x i32> %b)
   ret <2 x double> %result
@@ -192,70 +192,70 @@ define <2 x double> @test_ldexp_v2f64_v2i32(<2 x double> %a, <2 x i32> %b) {
 define half @test_ldexp_f16_i8(half %a, i8 %b) {
 ; GFX6-SDAG-LABEL: test_ldexp_f16_i8:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_f16_i8:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_f16_i8:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_f16_i8:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i8:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_f16_i8:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_f16_i8:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_f16_i8:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
 ; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i8(half %a, i8 %b)
   ret half %result
@@ -264,38 +264,38 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
 define half @test_ldexp_f16_i16(half %a, i16 %b) {
 ; GFX6-SDAG-LABEL: test_ldexp_f16_i16:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_f16_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_f16_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_f16_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i16(half %a, i16 %b)
   ret half %result
@@ -304,72 +304,72 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
 define half @test_ldexp_f16_i32(half %a, i32 %b) {
 ; GFX6-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v2
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
 ; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i32(half %a, i32 %b)
   ret half %result
@@ -378,18 +378,17 @@ define half @test_ldexp_f16_i32(half %a, i32 %b) {
 define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
@@ -397,11 +396,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
@@ -409,11 +408,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
@@ -423,22 +422,22 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v3
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
@@ -446,11 +445,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
@@ -458,11 +457,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -474,6 +473,7 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b)
   ret <2 x half> %result
@@ -482,7 +482,6 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX6-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 16
@@ -491,38 +490,38 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX6-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v1, v1, v3
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v2, v3, v2
 ; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
@@ -531,27 +530,27 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v2, v0, v1
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
@@ -560,6 +559,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> %a, <2 x i16> %b)
   ret <2 x half> %result
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index a0b2d3b32b7957f..49b57abaf9f2595 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -2079,7 +2079,6 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 define float @v_log_f32(float %in) {
 ; SI-SDAG-LABEL: v_log_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2098,11 +2097,11 @@ define float @v_log_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2121,11 +2120,11 @@ define float @v_log_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2147,11 +2146,11 @@ define float @v_log_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2173,11 +2172,11 @@ define float @v_log_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2196,11 +2195,11 @@ define float @v_log_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2219,11 +2218,11 @@ define float @v_log_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2241,11 +2240,11 @@ define float @v_log_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2263,6 +2262,7 @@ define float @v_log_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32:
@@ -2281,7 +2281,6 @@ define float @v_log_f32(float %in) {
 define float @v_log_fabs_f32(float %in) {
 ; SI-SDAG-LABEL: v_log_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2300,11 +2299,11 @@ define float @v_log_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2323,11 +2322,11 @@ define float @v_log_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2349,11 +2348,11 @@ define float @v_log_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2375,11 +2374,11 @@ define float @v_log_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2398,11 +2397,11 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2421,11 +2420,11 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2443,11 +2442,11 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2465,6 +2464,7 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fabs_f32:
@@ -2484,7 +2484,6 @@ define float @v_log_fabs_f32(float %in) {
 define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-SDAG-LABEL: v_log_fneg_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2503,11 +2502,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2526,11 +2525,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2552,11 +2551,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fneg_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2578,11 +2577,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2601,11 +2600,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2624,11 +2623,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fneg_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2646,11 +2645,11 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fneg_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2668,6 +2667,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_fabs_f32:
@@ -2688,7 +2688,6 @@ define float @v_log_fneg_fabs_f32(float %in) {
 define float @v_log_fneg_f32(float %in) {
 ; SI-SDAG-LABEL: v_log_fneg_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2707,11 +2706,11 @@ define float @v_log_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2730,11 +2729,11 @@ define float @v_log_fneg_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2756,11 +2755,11 @@ define float @v_log_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fneg_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2782,11 +2781,11 @@ define float @v_log_fneg_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2805,11 +2804,11 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2828,11 +2827,11 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fneg_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2850,11 +2849,11 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fneg_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2872,6 +2871,7 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_f32:
@@ -2891,7 +2891,6 @@ define float @v_log_fneg_f32(float %in) {
 define float @v_log_f32_fast(float %in) {
 ; SI-SDAG-LABEL: v_log_f32_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2902,11 +2901,11 @@ define float @v_log_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -2914,11 +2913,11 @@ define float @v_log_f32_fast(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2929,11 +2928,11 @@ define float @v_log_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -2941,11 +2940,11 @@ define float @v_log_f32_fast(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2956,11 +2955,11 @@ define float @v_log_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -2968,11 +2967,11 @@ define float @v_log_f32_fast(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
@@ -2981,16 +2980,17 @@ define float @v_log_f32_fast(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_fast:
@@ -3009,7 +3009,6 @@ define float @v_log_f32_fast(float %in) {
 define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-LABEL: v_log_f32_unsafe_math_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3020,11 +3019,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_unsafe_math_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3032,11 +3031,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3047,11 +3046,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_unsafe_math_attr:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3059,11 +3058,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3074,11 +3073,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_unsafe_math_attr:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3086,11 +3085,11 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_unsafe_math_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
@@ -3099,16 +3098,17 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_unsafe_math_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_unsafe_math_attr:
@@ -3127,7 +3127,6 @@ define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-SDAG-LABEL: v_log_f32_approx_fn_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3138,11 +3137,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_approx_fn_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3150,11 +3149,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3165,11 +3164,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_approx_fn_attr:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3177,11 +3176,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3192,11 +3191,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_approx_fn_attr:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3204,11 +3203,11 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_approx_fn_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
@@ -3217,16 +3216,17 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_approx_fn_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_approx_fn_attr:
@@ -3245,7 +3245,6 @@ define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 define float @v_log_f32_ninf(float %in) {
 ; SI-SDAG-LABEL: v_log_f32_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3264,11 +3263,11 @@ define float @v_log_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3287,11 +3286,11 @@ define float @v_log_f32_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3313,11 +3312,11 @@ define float @v_log_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3339,11 +3338,11 @@ define float @v_log_f32_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3362,11 +3361,11 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3385,11 +3384,11 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3407,11 +3406,11 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3429,6 +3428,7 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_ninf:
@@ -3447,7 +3447,6 @@ define float @v_log_f32_ninf(float %in) {
 define float @v_log_f32_afn(float %in) {
 ; SI-SDAG-LABEL: v_log_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3458,11 +3457,11 @@ define float @v_log_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3470,11 +3469,11 @@ define float @v_log_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3485,11 +3484,11 @@ define float @v_log_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3497,11 +3496,11 @@ define float @v_log_f32_afn(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3512,11 +3511,11 @@ define float @v_log_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3524,11 +3523,11 @@ define float @v_log_f32_afn(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
@@ -3537,16 +3536,17 @@ define float @v_log_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_afn:
@@ -3565,17 +3565,17 @@ define float @v_log_f32_afn(float %in) {
 define float @v_log_f32_afn_daz(float %in) #0 {
 ; GFX689-LABEL: v_log_f32_afn_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_f32_afn_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_afn_daz:
@@ -3594,7 +3594,6 @@ define float @v_log_f32_afn_daz(float %in) #0 {
 define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log_f32_afn_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3605,11 +3604,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_afn_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3617,11 +3616,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3632,11 +3631,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_afn_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3644,11 +3643,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3659,11 +3658,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_afn_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3671,11 +3670,11 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_afn_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo
@@ -3684,16 +3683,17 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_afn_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_afn_dynamic:
@@ -3712,7 +3712,6 @@ define float @v_log_f32_afn_dynamic(float %in) #1 {
 define float @v_fabs_log_f32_afn(float %in) {
 ; SI-SDAG-LABEL: v_fabs_log_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3723,11 +3722,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_fabs_log_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3735,11 +3734,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_log_f32_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3750,11 +3749,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_fabs_log_f32_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3762,11 +3761,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_fabs_log_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3777,11 +3776,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317218
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_fabs_log_f32_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc1b17218
@@ -3789,11 +3788,11 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317218
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_fabs_log_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -3803,17 +3802,18 @@ define float @v_fabs_log_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3f317218, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_fabs_log_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e64 v1, |v0|
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc1b17218, s0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3f317218, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fabs_log_f32_afn:
@@ -3833,7 +3833,6 @@ define float @v_fabs_log_f32_afn(float %in) {
 define float @v_log_f32_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log_f32_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -3844,11 +3843,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -3859,11 +3858,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -3877,11 +3876,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -3895,11 +3894,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -3910,11 +3909,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -3925,11 +3924,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -3940,11 +3939,11 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -3955,6 +3954,7 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_daz:
@@ -3973,7 +3973,6 @@ define float @v_log_f32_daz(float %in) #0 {
 define float @v_log_f32_nnan(float %in) {
 ; SI-SDAG-LABEL: v_log_f32_nnan:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3992,11 +3991,11 @@ define float @v_log_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4015,11 +4014,11 @@ define float @v_log_f32_nnan(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4041,11 +4040,11 @@ define float @v_log_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_nnan:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4067,11 +4066,11 @@ define float @v_log_f32_nnan(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4090,11 +4089,11 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4113,11 +4112,11 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4135,11 +4134,11 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4157,6 +4156,7 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan:
@@ -4175,7 +4175,6 @@ define float @v_log_f32_nnan(float %in) {
 define float @v_log_f32_nnan_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log_f32_nnan_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -4186,11 +4185,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -4201,11 +4200,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_nnan_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -4219,11 +4218,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_nnan_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -4237,11 +4236,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -4252,11 +4251,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -4267,11 +4266,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -4282,11 +4281,11 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -4297,6 +4296,7 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan_daz:
@@ -4315,7 +4315,6 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log_f32_nnan_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4334,11 +4333,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4357,11 +4356,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4383,11 +4382,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_nnan_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4409,11 +4408,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4432,11 +4431,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4455,11 +4454,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4477,11 +4476,11 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4499,6 +4498,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan_dynamic:
@@ -4517,7 +4517,6 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 define float @v_log_f32_ninf_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log_f32_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -4528,11 +4527,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -4543,11 +4542,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_ninf_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -4561,11 +4560,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_ninf_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -4579,11 +4578,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -4594,11 +4593,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -4609,11 +4608,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_ninf_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -4624,11 +4623,11 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_ninf_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -4639,6 +4638,7 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_ninf_daz:
@@ -4657,7 +4657,6 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log_f32_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4676,11 +4675,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4699,11 +4698,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4725,11 +4724,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4751,11 +4750,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4774,11 +4773,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4797,11 +4796,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4819,11 +4818,11 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4841,6 +4840,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_ninf_dynamic:
@@ -4859,7 +4859,6 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-SDAG-LABEL: v_log_f32_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4875,11 +4874,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4895,11 +4894,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4918,11 +4917,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4941,11 +4940,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4961,11 +4960,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4981,11 +4980,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5000,11 +4999,11 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5019,6 +5018,7 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan_ninf:
@@ -5037,7 +5037,6 @@ define float @v_log_f32_nnan_ninf(float %in) {
 define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log_f32_nnan_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5045,11 +5044,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3377d1cf
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
 ; SI-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5057,11 +5056,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_f32_nnan_ninf_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -5072,11 +5071,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5084,11 +5083,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3377d1cf
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5096,11 +5095,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5109,11 +5108,11 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3377d1cf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5122,6 +5121,7 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan_ninf_daz:
@@ -5140,7 +5140,6 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5156,11 +5155,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5176,11 +5175,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5199,11 +5198,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5222,11 +5221,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5242,11 +5241,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5262,11 +5261,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5281,11 +5280,11 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_nnan_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5300,6 +5299,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_nnan_ninf_dynamic:
@@ -5318,17 +5318,17 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 define float @v_log_f32_fast_daz(float %in) #0 {
 ; GFX689-LABEL: v_log_f32_fast_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_f32_fast_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_fast_daz:
@@ -5347,7 +5347,6 @@ define float @v_log_f32_fast_daz(float %in) #0 {
 define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-LABEL: v_log_f32_dynamic_mode:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5366,11 +5365,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_dynamic_mode:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5389,11 +5388,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5415,11 +5414,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_dynamic_mode:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5441,11 +5440,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5464,11 +5463,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_dynamic_mode:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5487,11 +5486,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_dynamic_mode:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5509,11 +5508,11 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_dynamic_mode:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5531,6 +5530,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_dynamic_mode:
@@ -5549,7 +5549,6 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 define float @v_log_f32_undef() {
 ; SI-SDAG-LABEL: v_log_f32_undef:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5560,11 +5559,11 @@ define float @v_log_f32_undef() {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_undef:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5583,11 +5582,11 @@ define float @v_log_f32_undef() {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_undef:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5601,11 +5600,11 @@ define float @v_log_f32_undef() {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_undef:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5627,11 +5626,11 @@ define float @v_log_f32_undef() {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_undef:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5642,11 +5641,11 @@ define float @v_log_f32_undef() {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_undef:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5665,11 +5664,11 @@ define float @v_log_f32_undef() {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_undef:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, s0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5680,11 +5679,11 @@ define float @v_log_f32_undef() {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_undef:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5702,6 +5701,7 @@ define float @v_log_f32_undef() {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_undef:
@@ -5720,7 +5720,6 @@ define float @v_log_f32_undef() {
 define float @v_log_f32_0() {
 ; SI-SDAG-LABEL: v_log_f32_0:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5732,11 +5731,11 @@ define float @v_log_f32_0() {
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-SDAG-NEXT:    v_add_f32_e32 v0, 0xc1b17218, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_0:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5752,11 +5751,11 @@ define float @v_log_f32_0() {
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_0:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5771,11 +5770,11 @@ define float @v_log_f32_0() {
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, 0xc1b17218, v0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_0:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
@@ -5794,11 +5793,11 @@ define float @v_log_f32_0() {
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_0:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5810,11 +5809,11 @@ define float @v_log_f32_0() {
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, 0xc1b17218, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_0:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
@@ -5830,11 +5829,11 @@ define float @v_log_f32_0() {
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_0:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
@@ -5847,11 +5846,11 @@ define float @v_log_f32_0() {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, 0xc1b17218, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_0:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; GFX1100-GISEL-NEXT:    v_cmp_lt_f32_e64 s0, 0, 0x800000
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
@@ -5866,6 +5865,7 @@ define float @v_log_f32_0() {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_0:
@@ -5884,7 +5884,6 @@ define float @v_log_f32_0() {
 define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; SI-SDAG-LABEL: v_log_f32_from_fpext_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5896,11 +5895,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_from_fpext_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -5912,11 +5911,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_from_fpext_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
@@ -5931,11 +5930,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_from_fpext_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -5950,11 +5949,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_from_fpext_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3377d1cf
@@ -5966,11 +5965,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_from_fpext_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -5982,11 +5981,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_from_fpext_f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
@@ -5999,11 +5998,11 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_from_fpext_f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -6016,6 +6015,7 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_from_fpext_f16:
@@ -6036,7 +6036,6 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-LABEL: v_log_f32_from_fpext_math_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
@@ -6058,11 +6057,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f32_from_fpext_math_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
@@ -6078,11 +6077,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_f32_from_fpext_math_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -6098,11 +6097,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_f32_from_fpext_math_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
@@ -6118,11 +6117,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_f32_from_fpext_math_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3f317217
@@ -6135,11 +6134,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_f32_from_fpext_math_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
@@ -6152,11 +6151,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_from_fpext_math_f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6170,11 +6169,11 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_f32_from_fpext_math_f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6188,6 +6187,7 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_from_fpext_math_f16:
@@ -6210,7 +6210,6 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; SI-LABEL: v_log_f32_from_fpext_bf16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6229,11 +6228,11 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6255,11 +6254,11 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6278,11 +6277,11 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6300,6 +6299,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_from_fpext_bf16:
@@ -6319,44 +6319,44 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
 define half @v_log_f16(half %in) {
 ; SI-SDAG-LABEL: v_log_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f16:
@@ -6375,44 +6375,44 @@ define half @v_log_f16(half %in) {
 define half @v_log_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, |v0|
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fabs_f16:
@@ -6432,44 +6432,44 @@ define half @v_log_fabs_f16(half %in) {
 define half @v_log_fneg_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_fneg_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -|v0|
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_fneg_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_fneg_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_fabs_f16:
@@ -6490,44 +6490,44 @@ define half @v_log_fneg_fabs_f16(half %in) {
 define half @v_log_fneg_f16(half %in) {
 ; SI-SDAG-LABEL: v_log_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_fneg_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_fneg_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_f16:
@@ -6547,44 +6547,44 @@ define half @v_log_fneg_f16(half %in) {
 define half @v_log_f16_fast(half %in) {
 ; SI-SDAG-LABEL: v_log_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3f317218, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f16_fast:
@@ -6603,7 +6603,6 @@ define half @v_log_f16_fast(half %in) {
 define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6616,11 +6615,11 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -6629,43 +6628,43 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_v2f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_v2f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6674,6 +6673,7 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v2f16:
@@ -6692,7 +6692,6 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
@@ -6705,11 +6704,11 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6723,22 +6722,22 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6746,32 +6745,32 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6780,11 +6779,11 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -6795,6 +6794,7 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fabs_v2f16:
@@ -6814,7 +6814,6 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6831,11 +6830,11 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6849,22 +6848,22 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6872,32 +6871,32 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6906,11 +6905,11 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fneg_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -6921,6 +6920,7 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_fabs_v2f16:
@@ -6941,7 +6941,6 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log_fneg_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6958,11 +6957,11 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_fneg_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6976,22 +6975,22 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_fneg_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_fneg_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6999,32 +6998,32 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_fneg_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -7033,11 +7032,11 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_fneg_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -7048,6 +7047,7 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_v2f16:
@@ -7067,7 +7067,6 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v2f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -7080,11 +7079,11 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -7093,43 +7092,43 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317218, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v2f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x398c
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_v2f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_v2f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -7138,6 +7137,7 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v2f16_fast:
@@ -7156,7 +7156,6 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v3f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -7175,11 +7174,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v3f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7192,11 +7191,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v2, v0
 ; VI-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_log_f16_e32 v1, v1
@@ -7205,11 +7204,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_v3f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_log_f16_e32 v1, v1
@@ -7217,11 +7216,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_v3f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -7233,6 +7232,7 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v3f16:
@@ -7251,7 +7251,6 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v3f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -7270,11 +7269,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v3f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7287,11 +7286,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_v3f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v2, v0
 ; VI-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_log_f16_e32 v1, v1
@@ -7300,11 +7299,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log_v3f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_log_f16_e32 v1, v1
@@ -7312,11 +7311,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log_v3f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -7328,6 +7327,7 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v3f16_fast:
@@ -7346,7 +7346,6 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v4f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -7371,11 +7370,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v4f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7392,11 +7391,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v4f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7408,11 +7407,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v3, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_v4f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
@@ -7424,11 +7423,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7439,11 +7438,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
@@ -7454,11 +7453,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_v4f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -7475,11 +7474,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_v4f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -7496,6 +7495,7 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v4f16:
@@ -7514,7 +7514,6 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log_v4f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -7539,11 +7538,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7560,11 +7559,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log_v4f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7576,11 +7575,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v3, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log_v4f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
@@ -7592,11 +7591,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7607,11 +7606,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
@@ -7622,11 +7621,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_v4f16_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -7643,11 +7642,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log_v4f16_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -7664,6 +7663,7 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 5ba72612321a6ad..6a69317baa16dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -2079,7 +2079,6 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 define float @v_log10_f32(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2098,11 +2097,11 @@ define float @v_log10_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2121,11 +2120,11 @@ define float @v_log10_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2147,11 +2146,11 @@ define float @v_log10_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2173,11 +2172,11 @@ define float @v_log10_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2196,11 +2195,11 @@ define float @v_log10_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2219,11 +2218,11 @@ define float @v_log10_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2241,11 +2240,11 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2263,6 +2262,7 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32:
@@ -2281,7 +2281,6 @@ define float @v_log10_f32(float %in) {
 define float @v_log10_fabs_f32(float %in) {
 ; SI-SDAG-LABEL: v_log10_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2300,11 +2299,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2323,11 +2322,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2349,11 +2348,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2375,11 +2374,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -2398,11 +2397,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -2421,11 +2420,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2443,11 +2442,11 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2465,6 +2464,7 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fabs_f32:
@@ -2484,7 +2484,6 @@ define float @v_log10_fabs_f32(float %in) {
 define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_fabs_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2503,11 +2502,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2526,11 +2525,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_fabs_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2552,11 +2551,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fneg_fabs_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2578,11 +2577,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fneg_fabs_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
@@ -2601,11 +2600,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -2624,11 +2623,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fneg_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2646,11 +2645,11 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fneg_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2668,6 +2667,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_fabs_f32:
@@ -2688,7 +2688,6 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 define float @v_log10_fneg_f32(float %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_f32:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2707,11 +2706,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2730,11 +2729,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_f32:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2756,11 +2755,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fneg_f32:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2782,11 +2781,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fneg_f32:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -2805,11 +2804,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_f32:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -2828,11 +2827,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fneg_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2850,11 +2849,11 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fneg_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -2872,6 +2871,7 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_f32:
@@ -2891,7 +2891,6 @@ define float @v_log10_fneg_f32(float %in) {
 define float @v_log10_f32_fast(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2902,11 +2901,11 @@ define float @v_log10_f32_fast(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -2914,11 +2913,11 @@ define float @v_log10_f32_fast(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2929,11 +2928,11 @@ define float @v_log10_f32_fast(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -2941,11 +2940,11 @@ define float @v_log10_f32_fast(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2956,11 +2955,11 @@ define float @v_log10_f32_fast(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -2968,11 +2967,11 @@ define float @v_log10_f32_fast(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
@@ -2981,16 +2980,17 @@ define float @v_log10_f32_fast(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_fast:
@@ -3009,7 +3009,6 @@ define float @v_log10_f32_fast(float %in) {
 define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-LABEL: v_log10_f32_unsafe_math_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3020,11 +3019,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_unsafe_math_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3032,11 +3031,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_unsafe_math_attr:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3047,11 +3046,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_unsafe_math_attr:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3059,11 +3058,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_unsafe_math_attr:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3074,11 +3073,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_unsafe_math_attr:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3086,11 +3085,11 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_unsafe_math_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
@@ -3099,16 +3098,17 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_unsafe_math_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_unsafe_math_attr:
@@ -3127,7 +3127,6 @@ define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; SI-SDAG-LABEL: v_log10_f32_approx_fn_attr:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3138,11 +3137,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_approx_fn_attr:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3150,11 +3149,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_approx_fn_attr:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3165,11 +3164,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_approx_fn_attr:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3177,11 +3176,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_approx_fn_attr:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3192,11 +3191,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_approx_fn_attr:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3204,11 +3203,11 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_approx_fn_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
@@ -3217,16 +3216,17 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_approx_fn_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_approx_fn_attr:
@@ -3245,7 +3245,6 @@ define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 define float @v_log10_f32_ninf(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3264,11 +3263,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3287,11 +3286,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3313,11 +3312,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3339,11 +3338,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3362,11 +3361,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -3385,11 +3384,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3407,11 +3406,11 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3429,6 +3428,7 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_ninf:
@@ -3447,7 +3447,6 @@ define float @v_log10_f32_ninf(float %in) {
 define float @v_log10_f32_afn(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3458,11 +3457,11 @@ define float @v_log10_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3470,11 +3469,11 @@ define float @v_log10_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3485,11 +3484,11 @@ define float @v_log10_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3497,11 +3496,11 @@ define float @v_log10_f32_afn(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3512,11 +3511,11 @@ define float @v_log10_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3524,11 +3523,11 @@ define float @v_log10_f32_afn(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
@@ -3537,16 +3536,17 @@ define float @v_log10_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_afn:
@@ -3565,17 +3565,17 @@ define float @v_log10_f32_afn(float %in) {
 define float @v_log10_f32_afn_daz(float %in) #0 {
 ; GFX689-LABEL: v_log10_f32_afn_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_f32_afn_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_afn_daz:
@@ -3594,7 +3594,6 @@ define float @v_log10_f32_afn_daz(float %in) #0 {
 define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log10_f32_afn_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3605,11 +3604,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_afn_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3617,11 +3616,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_afn_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3632,11 +3631,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_afn_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3644,11 +3643,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_afn_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3659,11 +3658,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_afn_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3671,11 +3670,11 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_afn_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo
@@ -3684,16 +3683,17 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_afn_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v1, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_afn_dynamic:
@@ -3712,7 +3712,6 @@ define float @v_log10_f32_afn_dynamic(float %in) #1 {
 define float @v_fabs_log10_f32_afn(float %in) {
 ; SI-SDAG-LABEL: v_fabs_log10_f32_afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3723,11 +3722,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_fabs_log10_f32_afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3735,11 +3734,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_fabs_log10_f32_afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3750,11 +3749,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_fabs_log10_f32_afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3762,11 +3761,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_fabs_log10_f32_afn:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -3777,11 +3776,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209b
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_fabs_log10_f32_afn:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e64 v2, |v0|
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc11a209b
@@ -3789,11 +3788,11 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209b
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v2, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_fabs_log10_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -3803,17 +3802,18 @@ define float @v_fabs_log10_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3e9a209b, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_fabs_log10_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e64 v1, |v0|
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 0xc11a209b, s0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v0, 0x3e9a209b, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fabs_log10_f32_afn:
@@ -3833,7 +3833,6 @@ define float @v_fabs_log10_f32_afn(float %in) {
 define float @v_log10_f32_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log10_f32_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -3844,11 +3843,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -3859,11 +3858,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -3877,11 +3876,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -3895,11 +3894,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -3910,11 +3909,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -3925,11 +3924,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -3940,11 +3939,11 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -3955,6 +3954,7 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_daz:
@@ -3973,7 +3973,6 @@ define float @v_log10_f32_daz(float %in) #0 {
 define float @v_log10_f32_nnan(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32_nnan:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -3992,11 +3991,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4015,11 +4014,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_nnan:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4041,11 +4040,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_nnan:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4067,11 +4066,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4090,11 +4089,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4113,11 +4112,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4135,11 +4134,11 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4157,6 +4156,7 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan:
@@ -4175,7 +4175,6 @@ define float @v_log10_f32_nnan(float %in) {
 define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log10_f32_nnan_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -4186,11 +4185,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -4201,11 +4200,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_nnan_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -4219,11 +4218,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_nnan_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -4237,11 +4236,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -4252,11 +4251,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -4267,11 +4266,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -4282,11 +4281,11 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -4297,6 +4296,7 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan_daz:
@@ -4315,7 +4315,6 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log10_f32_nnan_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4334,11 +4333,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4357,11 +4356,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_nnan_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4383,11 +4382,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_nnan_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4409,11 +4408,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4432,11 +4431,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4455,11 +4454,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4477,11 +4476,11 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4499,6 +4498,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan_dynamic:
@@ -4517,7 +4517,6 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log10_f32_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -4528,11 +4527,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -4543,11 +4542,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_ninf_daz:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -4561,11 +4560,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_ninf_daz:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
@@ -4579,11 +4578,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -4594,11 +4593,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -4609,11 +4608,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_ninf_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -4624,11 +4623,11 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_ninf_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -4639,6 +4638,7 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_ninf_daz:
@@ -4657,7 +4657,6 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log10_f32_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4676,11 +4675,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4699,11 +4698,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4725,11 +4724,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4751,11 +4750,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4774,11 +4773,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4797,11 +4796,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4819,11 +4818,11 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4841,6 +4840,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_ninf_dynamic:
@@ -4859,7 +4859,6 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-SDAG-LABEL: v_log10_f32_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4875,11 +4874,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4895,11 +4894,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4918,11 +4917,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4941,11 +4940,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan_ninf:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -4961,11 +4960,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -4981,11 +4980,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5000,11 +4999,11 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5019,6 +5018,7 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan_ninf:
@@ -5037,7 +5037,6 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-SDAG-LABEL: v_log10_f32_nnan_ninf_daz:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5045,11 +5044,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3284fbcf
 ; SI-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
 ; SI-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5057,11 +5056,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_f32_nnan_ninf_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-NEXT:    v_sub_f32_e32 v0, v0, v1
@@ -5072,11 +5071,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan_ninf_daz:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5084,11 +5083,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3284fbcf
 ; GFX900-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5096,11 +5095,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf_daz:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5109,11 +5108,11 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-SDAG-NEXT:    v_fmamk_f32 v0, v0, 0x3284fbcf, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf_daz:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5122,6 +5121,7 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan_ninf_daz:
@@ -5140,7 +5140,6 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5156,11 +5155,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5176,11 +5175,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5199,11 +5198,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5222,11 +5221,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5242,11 +5241,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5262,11 +5261,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5281,11 +5280,11 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_nnan_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5300,6 +5299,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_nnan_ninf_dynamic:
@@ -5318,17 +5318,17 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 define float @v_log10_f32_fast_daz(float %in) #0 {
 ; GFX689-LABEL: v_log10_f32_fast_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_f32_fast_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_fast_daz:
@@ -5347,7 +5347,6 @@ define float @v_log10_f32_fast_daz(float %in) #0 {
 define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-LABEL: v_log10_f32_dynamic_mode:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5366,11 +5365,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_dynamic_mode:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5389,11 +5388,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_dynamic_mode:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5415,11 +5414,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_dynamic_mode:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5441,11 +5440,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_dynamic_mode:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -5464,11 +5463,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX900-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_dynamic_mode:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -5487,11 +5486,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_dynamic_mode:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5509,11 +5508,11 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_dynamic_mode:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5531,6 +5530,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_dynamic_mode:
@@ -5549,7 +5549,6 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 define float @v_log10_f32_undef() {
 ; SI-SDAG-LABEL: v_log10_f32_undef:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5560,11 +5559,11 @@ define float @v_log10_f32_undef() {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_undef:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5583,11 +5582,11 @@ define float @v_log10_f32_undef() {
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_undef:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5601,11 +5600,11 @@ define float @v_log10_f32_undef() {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_undef:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5627,11 +5626,11 @@ define float @v_log10_f32_undef() {
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_undef:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, s4
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5642,11 +5641,11 @@ define float @v_log10_f32_undef() {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_undef:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -5665,11 +5664,11 @@ define float @v_log10_f32_undef() {
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_undef:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, s0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5680,11 +5679,11 @@ define float @v_log10_f32_undef() {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_undef:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5702,6 +5701,7 @@ define float @v_log10_f32_undef() {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_undef:
@@ -5720,7 +5720,6 @@ define float @v_log10_f32_undef() {
 define float @v_log10_f32_0() {
 ; SI-SDAG-LABEL: v_log10_f32_0:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5732,11 +5731,11 @@ define float @v_log10_f32_0() {
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-SDAG-NEXT:    v_add_f32_e32 v0, 0xc11a209b, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_0:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5752,11 +5751,11 @@ define float @v_log10_f32_0() {
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_0:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
@@ -5771,11 +5770,11 @@ define float @v_log10_f32_0() {
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; VI-SDAG-NEXT:    v_add_f32_e32 v0, 0xc11a209b, v0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_0:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v0
@@ -5794,11 +5793,11 @@ define float @v_log10_f32_0() {
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_0:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5810,11 +5809,11 @@ define float @v_log10_f32_0() {
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v0, 0xc11a209b, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_0:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
@@ -5830,11 +5829,11 @@ define float @v_log10_f32_0() {
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_0:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
@@ -5847,11 +5846,11 @@ define float @v_log10_f32_0() {
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, 0xc11a209b, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_0:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, 0
 ; GFX1100-GISEL-NEXT:    v_cmp_lt_f32_e64 s0, 0, 0x800000
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
@@ -5866,6 +5865,7 @@ define float @v_log10_f32_0() {
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_0:
@@ -5884,7 +5884,6 @@ define float @v_log10_f32_0() {
 define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; SI-SDAG-LABEL: v_log10_f32_from_fpext_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; SI-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5896,11 +5895,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; SI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; SI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_from_fpext_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -5912,11 +5911,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_from_fpext_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
 ; VI-SDAG-NEXT:    v_log_f32_e32 v0, v0
@@ -5931,11 +5930,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_from_fpext_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -5950,11 +5949,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_from_fpext_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
 ; GFX900-SDAG-NEXT:    s_mov_b32 s5, 0x3284fbcf
@@ -5966,11 +5965,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_from_fpext_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -5982,11 +5981,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_from_fpext_f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
@@ -5999,11 +5998,11 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_from_fpext_f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -6016,6 +6015,7 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_from_fpext_f16:
@@ -6036,7 +6036,6 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-LABEL: v_log10_f32_from_fpext_math_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
@@ -6058,11 +6057,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f32_from_fpext_math_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
@@ -6078,11 +6077,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_f32_from_fpext_math_f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
@@ -6098,11 +6097,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; VI-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; VI-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_f32_from_fpext_math_f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7f800000
@@ -6118,11 +6117,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_f32_from_fpext_math_f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    s_mov_b32 s4, 0x3e9a209a
@@ -6135,11 +6134,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX900-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s6
 ; GFX900-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_f32_from_fpext_math_f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
@@ -6152,11 +6151,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_from_fpext_math_f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6170,11 +6169,11 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_f32_from_fpext_math_f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6188,6 +6187,7 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_from_fpext_math_f16:
@@ -6210,7 +6210,6 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; SI-LABEL: v_log10_f32_from_fpext_bf16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x800000
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6229,11 +6228,11 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_f32_from_fpext_bf16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x800000
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; VI-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6255,11 +6254,11 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; VI-NEXT:    v_sub_f32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_f32_from_fpext_bf16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX900-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -6278,11 +6277,11 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6300,6 +6299,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_from_fpext_bf16:
@@ -6319,44 +6319,44 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
 define half @v_log10_f16(half %in) {
 ; SI-SDAG-LABEL: v_log10_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f16:
@@ -6375,44 +6375,44 @@ define half @v_log10_f16(half %in) {
 define half @v_log10_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log10_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, |v0|
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fabs_f16:
@@ -6432,44 +6432,44 @@ define half @v_log10_fabs_f16(half %in) {
 define half @v_log10_fneg_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_fneg_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -|v0|
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_fneg_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_fneg_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_fabs_f16:
@@ -6490,44 +6490,44 @@ define half @v_log10_fneg_fabs_f16(half %in) {
 define half @v_log10_fneg_f16(half %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_fneg_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_fneg_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_f16:
@@ -6547,44 +6547,44 @@ define half @v_log10_fneg_f16(half %in) {
 define half @v_log10_f16_fast(half %in) {
 ; SI-SDAG-LABEL: v_log10_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3e9a209b, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
 ; VI-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f16_fast:
@@ -6603,7 +6603,6 @@ define half @v_log10_f16_fast(half %in) {
 define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -6616,11 +6615,11 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -6629,43 +6628,43 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_v2f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_v2f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6674,6 +6673,7 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v2f16:
@@ -6692,7 +6692,6 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
@@ -6705,11 +6704,11 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6723,22 +6722,22 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6746,32 +6745,32 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6780,11 +6779,11 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -6795,6 +6794,7 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fabs_v2f16:
@@ -6814,7 +6814,6 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6831,11 +6830,11 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6849,22 +6848,22 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6872,32 +6871,32 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -6906,11 +6905,11 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -6921,6 +6920,7 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_fabs_v2f16:
@@ -6941,7 +6941,6 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_fneg_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -6958,11 +6957,11 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_fneg_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -6976,22 +6975,22 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_fneg_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_fneg_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -6999,32 +6998,32 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_fneg_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -7033,11 +7032,11 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_fneg_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -7048,6 +7047,7 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_v2f16:
@@ -7067,7 +7067,6 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v2f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -7080,11 +7079,11 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
@@ -7093,43 +7092,43 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209b, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v2f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x34d1
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_v2f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_v2f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -7138,6 +7137,7 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v2f16_fast:
@@ -7156,7 +7156,6 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v3f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -7175,11 +7174,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v3f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7192,11 +7191,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_v3f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v2, v0
 ; VI-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_log_f16_e32 v1, v1
@@ -7205,11 +7204,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_v3f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_log_f16_e32 v1, v1
@@ -7217,11 +7216,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_v3f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -7233,6 +7232,7 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v3f16:
@@ -7251,7 +7251,6 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v3f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -7270,11 +7269,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v3f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7287,11 +7286,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_v3f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v2, v0
 ; VI-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_log_f16_e32 v1, v1
@@ -7300,11 +7299,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log10_v3f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_log_f16_e32 v1, v1
@@ -7312,11 +7311,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log10_v3f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -7328,6 +7327,7 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v3f16_fast:
@@ -7346,7 +7346,6 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v4f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -7371,11 +7370,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v4f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7392,11 +7391,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v4f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7408,11 +7407,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v3, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_v4f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
@@ -7424,11 +7423,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7439,11 +7438,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
@@ -7454,11 +7453,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_v4f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -7475,11 +7474,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_v4f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -7496,6 +7495,7 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v4f16:
@@ -7514,7 +7514,6 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log10_v4f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -7539,11 +7538,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log10_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -7560,11 +7559,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log10_v4f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7576,11 +7575,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v3, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log10_v4f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
@@ -7592,11 +7591,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
@@ -7607,11 +7606,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
@@ -7622,11 +7621,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_v4f16_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -7643,11 +7642,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log10_v4f16_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -7664,6 +7663,7 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 6ccef4c02ab3b10..61d9eaa29275885 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -1208,7 +1208,6 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 define float @v_log2_f32(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1218,11 +1217,11 @@ define float @v_log2_f32(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1232,11 +1231,11 @@ define float @v_log2_f32(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1245,11 +1244,11 @@ define float @v_log2_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1258,6 +1257,7 @@ define float @v_log2_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32:
@@ -1276,7 +1276,6 @@ define float @v_log2_f32(float %in) {
 define float @v_log2_fabs_f32(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1286,11 +1285,11 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_fabs_f32:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -1300,11 +1299,11 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -1314,11 +1313,11 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -1328,6 +1327,7 @@ define float @v_log2_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fabs_f32:
@@ -1347,7 +1347,6 @@ define float @v_log2_fabs_f32(float %in) {
 define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1357,11 +1356,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_fneg_fabs_f32:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -|v0|, v1
@@ -1371,11 +1370,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e64 s0, 0x80800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -1385,11 +1384,11 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fneg_fabs_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -|v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -1399,6 +1398,7 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_fabs_f32:
@@ -1419,7 +1419,6 @@ define float @v_log2_fneg_fabs_f32(float %in) {
 define float @v_log2_fneg_f32(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x80800000
 ; GFX689-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1429,11 +1428,11 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_fneg_f32:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, -v0, v1
@@ -1443,11 +1442,11 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_f32:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x80800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1456,11 +1455,11 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fneg_f32:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, -v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -1470,6 +1469,7 @@ define float @v_log2_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_f32:
@@ -1489,7 +1489,6 @@ define float @v_log2_fneg_f32(float %in) {
 define float @v_log2_f32_fast(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32_fast:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1499,11 +1498,11 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_fast:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1513,11 +1512,11 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1526,11 +1525,11 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1539,6 +1538,7 @@ define float @v_log2_f32_fast(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_fast:
@@ -1557,7 +1557,6 @@ define float @v_log2_f32_fast(float %in) {
 define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX689-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1567,11 +1566,11 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1581,11 +1580,11 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1594,11 +1593,11 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_unsafe_math_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1607,6 +1606,7 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_unsafe_math_attr:
@@ -1625,7 +1625,6 @@ define float @v_log2_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" {
 define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" {
 ; GFX689-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1635,11 +1634,11 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1649,11 +1648,11 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1662,11 +1661,11 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_approx_fn_attr:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1675,6 +1674,7 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_approx_fn_attr:
@@ -1693,7 +1693,6 @@ define float @v_log2_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true"
 define float @v_log2_f32_ninf(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1703,11 +1702,11 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_ninf:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1717,11 +1716,11 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1730,11 +1729,11 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1743,6 +1742,7 @@ define float @v_log2_f32_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_ninf:
@@ -1761,7 +1761,6 @@ define float @v_log2_f32_ninf(float %in) {
 define float @v_log2_f32_afn(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32_afn:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1771,11 +1770,11 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_afn:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1785,11 +1784,11 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1798,11 +1797,11 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1811,6 +1810,7 @@ define float @v_log2_f32_afn(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_afn:
@@ -1829,14 +1829,14 @@ define float @v_log2_f32_afn(float %in) {
 define float @v_log2_f32_afn_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_afn_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_afn_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_afn_daz:
@@ -1855,7 +1855,6 @@ define float @v_log2_f32_afn_daz(float %in) #0 {
 define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX689-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1865,11 +1864,11 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_afn_dynamic:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -1879,11 +1878,11 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_afn_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -1892,11 +1891,11 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_afn_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1905,6 +1904,7 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_afn_dynamic:
@@ -1923,7 +1923,6 @@ define float @v_log2_f32_afn_dynamic(float %in) #1 {
 define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX689-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -1933,11 +1932,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_fabs_log2_f32_afn:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
@@ -1947,11 +1946,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_fabs_log2_f32_afn:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s0
@@ -1961,11 +1960,11 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_fabs_log2_f32_afn:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0
@@ -1975,6 +1974,7 @@ define float @v_fabs_log2_f32_afn(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_fabs_log2_f32_afn:
@@ -1994,14 +1994,14 @@ define float @v_fabs_log2_f32_afn(float %in) {
 define float @v_log2_f32_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_daz:
@@ -2020,7 +2020,6 @@ define float @v_log2_f32_daz(float %in) #0 {
 define float @v_log2_f32_nnan(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2030,11 +2029,11 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_nnan:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2044,11 +2043,11 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2057,11 +2056,11 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_nnan:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2070,6 +2069,7 @@ define float @v_log2_f32_nnan(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan:
@@ -2088,14 +2088,14 @@ define float @v_log2_f32_nnan(float %in) {
 define float @v_log2_f32_nnan_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_nnan_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_nnan_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan_daz:
@@ -2114,7 +2114,6 @@ define float @v_log2_f32_nnan_daz(float %in) #0 {
 define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX689-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2124,11 +2123,11 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2138,11 +2137,11 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2151,11 +2150,11 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_nnan_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2164,6 +2163,7 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan_dynamic:
@@ -2182,14 +2182,14 @@ define float @v_log2_f32_nnan_dynamic(float %in) #1 {
 define float @v_log2_f32_ninf_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_ninf_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_ninf_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_ninf_daz:
@@ -2208,7 +2208,6 @@ define float @v_log2_f32_ninf_daz(float %in) #0 {
 define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX689-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2218,11 +2217,11 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2232,11 +2231,11 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2245,11 +2244,11 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2258,6 +2257,7 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_ninf_dynamic:
@@ -2276,7 +2276,6 @@ define float @v_log2_f32_ninf_dynamic(float %in) #1 {
 define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2286,11 +2285,11 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2300,11 +2299,11 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2313,11 +2312,11 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_nnan_ninf:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2326,6 +2325,7 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan_ninf:
@@ -2344,14 +2344,14 @@ define float @v_log2_f32_nnan_ninf(float %in) {
 define float @v_log2_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_nnan_ninf_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_nnan_ninf_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan_ninf_daz:
@@ -2370,7 +2370,6 @@ define float @v_log2_f32_nnan_ninf_daz(float %in) #0 {
 define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX689-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2380,11 +2379,11 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2394,11 +2393,11 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2407,11 +2406,11 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_nnan_ninf_dynamic:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2420,6 +2419,7 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_nnan_ninf_dynamic:
@@ -2438,14 +2438,14 @@ define float @v_log2_f32_nnan_ninf_dynamic(float %in) #1 {
 define float @v_log2_f32_fast_daz(float %in) #0 {
 ; GFX689-LABEL: v_log2_f32_fast_daz:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_fast_daz:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_fast_daz:
@@ -2464,7 +2464,6 @@ define float @v_log2_f32_fast_daz(float %in) #0 {
 define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX689-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2474,11 +2473,11 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX689-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_dynamic_mode:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
@@ -2488,11 +2487,11 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_dynamic_mode:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2501,11 +2500,11 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_dynamic_mode:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -2514,6 +2513,7 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_dynamic_mode:
@@ -2532,13 +2532,12 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 {
 define float @v_log2_f32_undef() {
 ; GFX689-SDAG-LABEL: v_log2_f32_undef:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    v_log_f32_e32 v0, s4
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_undef:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX689-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v1
@@ -2548,17 +2547,17 @@ define float @v_log2_f32_undef() {
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_undef:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, s0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_undef:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -2568,6 +2567,7 @@ define float @v_log2_f32_undef() {
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_undef:
@@ -2586,29 +2586,29 @@ define float @v_log2_f32_undef() {
 define float @v_log2_f32_0() {
 ; GFX689-SDAG-LABEL: v_log2_f32_0:
 ; GFX689-SDAG:       ; %bb.0:
-; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX689-SDAG-NEXT:    v_add_f32_e32 v0, 0xc2000000, v0
+; GFX689-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX689-GISEL-LABEL: v_log2_f32_0:
 ; GFX689-GISEL:       ; %bb.0:
-; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    v_mov_b32_e32 v0, 0xff800000
+; GFX689-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_f32_0:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, 0
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_add_f32_e32 v0, 0xc2000000, v0
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_f32_0:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_mov_b32_e32 v0, 0xff800000
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_0:
@@ -2627,17 +2627,17 @@ define float @v_log2_f32_0() {
 define float @v_log2_f32_from_fpext_f16(i16 %src.i) {
 ; GFX689-LABEL: v_log2_f32_from_fpext_f16:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_from_fpext_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_from_fpext_f16:
@@ -2658,7 +2658,6 @@ define float @v_log2_f32_from_fpext_f16(i16 %src.i) {
 define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-LABEL: v_log2_f32_from_fpext_math_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0x800000
@@ -2671,42 +2670,43 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-SDAG-NEXT:    v_sub_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_f32_from_fpext_math_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_f32_from_fpext_math_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_log_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_f32_from_fpext_math_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_log_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_from_fpext_math_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_from_fpext_math_f16:
@@ -2729,7 +2729,6 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
 ; GFX689-LABEL: v_log2_f32_from_fpext_bf16:
 ; GFX689:       ; %bb.0:
-; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_mov_b32 s4, 0x800000
 ; GFX689-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
 ; GFX689-NEXT:    v_mov_b32_e32 v2, 0x4f800000
@@ -2739,11 +2738,11 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
 ; GFX689-NEXT:    v_mov_b32_e32 v1, 0x42000000
 ; GFX689-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX689-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX689-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f32_from_fpext_bf16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
 ; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -2752,6 +2751,7 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
 ; GFX1100-NEXT:    v_log_f32_e32 v0, v0
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_from_fpext_bf16:
@@ -2771,38 +2771,38 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
 define half @v_log2_f16(half %in) {
 ; SI-SDAG-LABEL: v_log2_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f16:
@@ -2821,38 +2821,38 @@ define half @v_log2_f16(half %in) {
 define half @v_log2_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log2_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, |v0|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fabs_f16:
@@ -2872,38 +2872,38 @@ define half @v_log2_fabs_f16(half %in) {
 define half @v_log2_fneg_fabs_f16(half %in) {
 ; SI-SDAG-LABEL: v_log2_fneg_fabs_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fneg_fabs_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_fneg_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -|v0|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_fneg_fabs_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_fneg_fabs_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_fabs_f16:
@@ -2924,38 +2924,38 @@ define half @v_log2_fneg_fabs_f16(half %in) {
 define half @v_log2_fneg_f16(half %in) {
 ; SI-SDAG-LABEL: v_log2_fneg_f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fneg_f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e64 v0, -v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_fneg_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e64 v0, -v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_fneg_f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_f16:
@@ -2975,38 +2975,38 @@ define half @v_log2_fneg_f16(half %in) {
 define half @v_log2_f16_fast(half %in) {
 ; SI-SDAG-LABEL: v_log2_f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_log_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log2_f16_fast:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_log_f16_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_log2_f16_fast:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f16_fast:
@@ -3025,7 +3025,6 @@ define half @v_log2_f16_fast(half %in) {
 define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -3036,60 +3035,61 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_v2f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v2f16:
@@ -3108,7 +3108,6 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
@@ -3119,11 +3118,11 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3135,56 +3134,56 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, |v1|
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -3192,6 +3191,7 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fabs_v2f16:
@@ -3211,7 +3211,6 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3226,11 +3225,11 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3242,56 +3241,56 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -|v1|
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -3299,6 +3298,7 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_fabs_v2f16:
@@ -3319,7 +3319,6 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_fneg_v2f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -3334,11 +3333,11 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_fneg_v2f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3350,56 +3349,56 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_fneg_v2f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_fneg_v2f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_fneg_v2f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -v1
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_fneg_v2f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
@@ -3407,6 +3406,7 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_v2f16:
@@ -3426,7 +3426,6 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v2f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -3437,60 +3436,61 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v2f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v2f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_v2f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v2f16_fast:
@@ -3509,7 +3509,6 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v3f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3525,11 +3524,11 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v3f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -3539,47 +3538,47 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v3f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v3f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_v3f16:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -3587,6 +3586,7 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX1100-NEXT:    v_log_f16_e32 v2, v2
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v3f16:
@@ -3605,7 +3605,6 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v3f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3621,11 +3620,11 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v3f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -3635,47 +3634,47 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v3f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v3f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-LABEL: v_log2_v3f16_fast:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_log_f16_e32 v1, v1
@@ -3683,6 +3682,7 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX1100-NEXT:    v_log_f16_e32 v2, v2
 ; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v3f16_fast:
@@ -3701,7 +3701,6 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v4f16:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3722,11 +3721,11 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v4f16:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -3739,55 +3738,55 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v4f16:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v3
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v4f16:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v4f16:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_v4f16:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -3798,11 +3797,11 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_v4f16:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -3813,6 +3812,7 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v4f16:
@@ -3831,7 +3831,6 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-LABEL: v_log2_v4f16_fast:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3852,11 +3851,11 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_log2_v4f16_fast:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -3869,55 +3868,55 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_log2_v4f16_fast:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; VI-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; VI-SDAG-NEXT:    v_or_b32_e32 v0, v0, v3
 ; VI-SDAG-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_log2_v4f16_fast:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
 ; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log2_v4f16_fast:
 ; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
@@ -3928,11 +3927,11 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-LABEL: v_log2_v4f16_fast:
 ; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
@@ -3943,6 +3942,7 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index ab527481235ca94..011a8ecc7b8a63d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -7,7 +7,6 @@
 define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-LABEL: umulo_i64_v_v:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
 ; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
@@ -26,11 +25,11 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: umulo_i64_v_v:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
@@ -48,11 +47,11 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: umulo_i64_v_v:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
@@ -70,11 +69,11 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: umulo_i64_v_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
@@ -96,6 +95,7 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
@@ -105,7 +105,6 @@ bb:
 define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-LABEL: smulo_i64_v_v:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
 ; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
@@ -138,11 +137,11 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v1, v5
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_v:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
@@ -172,11 +171,11 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v4
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: smulo_i64_v_v:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
@@ -206,11 +205,11 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: smulo_i64_v_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v2, 0
@@ -247,6 +246,7 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
@@ -572,7 +572,6 @@ bb:
 define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; SI-LABEL: smulo_i64_v_4:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
 ; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
 ; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
@@ -580,11 +579,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, v5
 ; SI-NEXT:    v_mov_b32_e32 v1, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: smulo_i64_v_4:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
@@ -592,11 +591,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: smulo_i64_v_4:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX10-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX10-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
@@ -604,11 +603,11 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: smulo_i64_v_4:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -617,6 +616,7 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
@@ -626,7 +626,6 @@ bb:
 define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; SI-LABEL: umulo_i64_v_4:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
 ; SI-NEXT:    v_mov_b32_e32 v6, v0
 ; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
@@ -635,11 +634,11 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: umulo_i64_v_4:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
@@ -648,11 +647,11 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: umulo_i64_v_4:
 ; GFX10:       ; %bb.0: ; %bb
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
@@ -661,11 +660,11 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: umulo_i64_v_4:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1
 ; GFX11-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
 ; GFX11-NEXT:    v_alignbit_b32 v3, v1, v0, 30
@@ -673,6 +672,7 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) {
 ; GFX11-NEXT:    v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1]
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
index a467e29d0bff63a..c09eaa2a84347d8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
@@ -6,18 +6,17 @@
 define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX78-LABEL: v_powi_f16:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX78-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX78-NEXT:    v_log_f32_e32 v0, v0
 ; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX78-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -27,6 +26,7 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %l.cast = bitcast i16 %l to half
   %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r)
@@ -37,22 +37,22 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 define float @v_powi_f32(float %l, i32 %r) {
 ; GFX78-LABEL: v_powi_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_log_f32_e32 v0, v0
 ; GFX78-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX78-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX78-NEXT:    v_exp_f32_e32 v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_log_f32_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 %r)
   ret float %res
@@ -61,14 +61,14 @@ define float @v_powi_f32(float %l, i32 %r) {
 define float @v_powi_0_f32(float %l) {
 ; GFX78-LABEL: v_powi_0_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_0_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 0)
   ret float %res
@@ -91,7 +91,6 @@ define float @v_powi_1_f32(float %l) {
 define float @v_powi_neg1_f32(float %l) {
 ; GFX7-LABEL: v_powi_neg1_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
 ; GFX7-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -103,11 +102,11 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_powi_neg1_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; GFX8-NEXT:    v_rcp_f32_e32 v3, v1
@@ -119,11 +118,11 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg1_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v1
@@ -140,6 +139,7 @@ define float @v_powi_neg1_f32(float %l) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -1)
   ret float %res
@@ -148,14 +148,14 @@ define float @v_powi_neg1_f32(float %l) {
 define float @v_powi_2_f32(float %l) {
 ; GFX78-LABEL: v_powi_2_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_2_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 2)
   ret float %res
@@ -164,7 +164,6 @@ define float @v_powi_2_f32(float %l) {
 define float @v_powi_neg2_f32(float %l) {
 ; GFX7-LABEL: v_powi_neg2_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX7-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX7-NEXT:    v_rcp_f32_e32 v2, v1
@@ -177,11 +176,11 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_powi_neg2_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX8-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; GFX8-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
@@ -194,11 +193,11 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg2_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_scale_f32 v1, null, v0, v0, 1.0
@@ -217,6 +216,7 @@ define float @v_powi_neg2_f32(float %l) {
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -2)
   ret float %res
@@ -225,17 +225,17 @@ define float @v_powi_neg2_f32(float %l) {
 define float @v_powi_4_f32(float %l) {
 ; GFX78-LABEL: v_powi_4_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_4_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 4)
   ret float %res
@@ -244,19 +244,19 @@ define float @v_powi_4_f32(float %l) {
 define float @v_powi_8_f32(float %l) {
 ; GFX78-LABEL: v_powi_8_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_8_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 8)
   ret float %res
@@ -265,22 +265,22 @@ define float @v_powi_8_f32(float %l) {
 define float @v_powi_16_f32(float %l) {
 ; GFX78-LABEL: v_powi_16_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_16_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 16)
   ret float %res
@@ -289,7 +289,6 @@ define float @v_powi_16_f32(float %l) {
 define float @v_powi_128_f32(float %l) {
 ; GFX78-LABEL: v_powi_128_f32:
 ; GFX78:       ; %bb.0:
-; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
@@ -297,11 +296,11 @@ define float @v_powi_128_f32(float %l) {
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX78-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX78-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_128_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
@@ -312,6 +311,7 @@ define float @v_powi_128_f32(float %l) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 128)
   ret float %res
@@ -320,7 +320,6 @@ define float @v_powi_128_f32(float %l) {
 define float @v_powi_neg128_f32(float %l) {
 ; GFX7-LABEL: v_powi_neg128_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v0
@@ -339,11 +338,11 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX7-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; GFX7-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX7-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_powi_neg128_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v0
@@ -362,11 +361,11 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX8-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; GFX8-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; GFX8-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_powi_neg128_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v0
@@ -394,6 +393,7 @@ define float @v_powi_neg128_f32(float %l) {
 ; GFX11-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = call float @llvm.powi.f32.i32(float %l, i32 -128)
   ret float %res
diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
index ba025a2202313fb..af124709481123d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll
@@ -7,7 +7,6 @@
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v2, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v2
@@ -16,41 +15,36 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %
 ; GFX900-NEXT:    ds_write_b16 v0, v2
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
-; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v1, v0
 ; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    ds_write_b16 v2, v1
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v1, v0
 ; GFX803-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    ds_write_b16 v2, v1
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v2, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, v2
@@ -59,7 +53,7 @@ define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(ptr addrspace(3) noalias %
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
@@ -74,59 +68,51 @@ entry:
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(ptr addrspace(3) noalias %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v1, v0 offset:16
 ; GFX900-NEXT:    ds_read_u16 v0, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_write_b16 v2, v1
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v1, v0 offset:16
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    ds_write_b16 v2, v1
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v1, v0 offset:16
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    ds_write_b16 v2, v1
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_hi:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0 offset:16
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v2, v1
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
@@ -141,59 +127,51 @@ entry:
 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out0, ptr addrspace(3) noalias %out1) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v3, v0
 ; GFX900-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_write_b16 v1, v3
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-NEXT:    ds_write_b16 v2, v0
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v3, v0
 ; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    ds_write_b16 v1, v3
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX906-NEXT:    ds_write_b16 v2, v0
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v3, s4
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v3, v0
 ; GFX803-NEXT:    ds_read_u16 v0, v0 offset:16
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    ds_write_b16 v1, v3
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX803-NEXT:    ds_write_b16 v2, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_multi_use_lohi:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v3, v0
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v3
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v2, v0
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v0, v3, s0
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
@@ -209,33 +187,31 @@ entry:
 define <2 x i16> @load_local_hi_v2i16_undeflo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_undeflo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v0, v0
-; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_undeflo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_undeflo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_undeflo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v0, v0
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -246,37 +222,37 @@ entry:
 define <2 x i16> @load_local_hi_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_reglo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_reglo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -288,43 +264,39 @@ entry:
 define void @load_local_hi_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -337,37 +309,37 @@ entry:
 define <2 x i16> @load_local_hi_v2i16_zerolo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_zerolo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_zerolo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_zerolo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -379,35 +351,35 @@ entry:
 define i32 @load_local_hi_v2i16_zerolo_shift(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_zerolo_shift:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v0, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_zerolo_shift:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_zerolo_shift:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_zerolo_shift:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -419,43 +391,39 @@ entry:
 define void @load_local_hi_v2f16_reglo_vreg(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, ptr addrspace(3) %in
@@ -468,43 +436,39 @@ entry:
 define void @load_local_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u8_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u8_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -518,43 +482,39 @@ entry:
 define void @load_local_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_i8_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_i8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_i8 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_i8_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -568,43 +528,39 @@ entry:
 define void @load_local_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u8_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u8_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -620,43 +576,39 @@ entry:
 define void @load_local_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_i8_d16_hi v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_i8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_i8 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_i8_d16_hi v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -672,27 +624,24 @@ entry:
 define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -700,16 +649,15 @@ define void @load_global_hi_v2i16_reglo_vreg(ptr addrspace(1) %in, i16 %reg) #0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 -2047
@@ -723,27 +671,24 @@ entry:
 define void @load_global_hi_v2f16_reglo_vreg(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -751,16 +696,15 @@ define void @load_global_hi_v2f16_reglo_vreg(ptr addrspace(1) %in, half %reg) #0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, ptr addrspace(1) %in, i64 -2047
@@ -774,27 +718,24 @@ entry:
 define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -802,16 +743,15 @@ define void @load_global_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i16 %r
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
@@ -826,27 +766,24 @@ entry:
 define void @load_global_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -854,16 +791,15 @@ define void @load_global_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i16 %r
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
@@ -878,27 +814,24 @@ entry:
 define void @load_global_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -906,16 +839,15 @@ define void @load_global_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, half %
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
@@ -931,27 +863,24 @@ entry:
 define void @load_global_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, half %reg) #0 {
 ; GFX900-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -959,16 +888,15 @@ define void @load_global_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, half %
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(1) %in, i64 -4095
@@ -984,42 +912,38 @@ entry:
 define void @load_flat_hi_v2i16_reglo_vreg(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr %in
@@ -1032,42 +956,38 @@ entry:
 define void @load_flat_hi_v2f16_reglo_vreg(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, ptr %in
@@ -1080,42 +1000,38 @@ entry:
 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr %in
@@ -1129,42 +1045,38 @@ entry:
 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(ptr %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr %in
@@ -1178,42 +1090,38 @@ entry:
 define void @load_flat_hi_v2f16_reglo_vreg_zexti8(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_ubyte_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr %in
@@ -1228,42 +1136,38 @@ entry:
 define void @load_flat_hi_v2f16_reglo_vreg_sexti8(ptr %in, half %reg) #0 {
 ; GFX900-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_sbyte_d16_hi v2, v[0:1]
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr %in
@@ -1278,42 +1182,38 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
@@ -1327,42 +1227,38 @@ entry:
 define void @load_private_hi_v2f16_reglo_vreg(ptr addrspace(5) byval(half) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, ptr addrspace(5) %in, i64 2047
@@ -1376,43 +1272,39 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_nooff(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i16, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
@@ -1425,43 +1317,39 @@ entry:
 define void @load_private_hi_v2f16_reglo_vreg_nooff(ptr addrspace(5) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile half, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
@@ -1474,42 +1362,38 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
@@ -1524,42 +1408,38 @@ entry:
 define void @load_private_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
@@ -1575,42 +1455,38 @@ entry:
 define void @load_private_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
@@ -1626,42 +1502,38 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i64 4095
@@ -1676,43 +1548,39 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
@@ -1726,43 +1594,39 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
@@ -1776,43 +1640,39 @@ entry:
 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, half %reg) #0 {
 ; GFX900-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load volatile i8, ptr addrspace(5) inttoptr (i32 4094 to ptr addrspace(5))
@@ -1827,27 +1687,24 @@ entry:
 define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_hi_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_hi_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -1855,16 +1712,15 @@ define void @load_constant_hi_v2i16_reglo_vreg(ptr addrspace(4) %in, i16 %reg) #
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_constant_hi_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 -2047
@@ -1878,27 +1734,24 @@ entry:
 define void @load_constant_hi_v2f16_reglo_vreg(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -1906,16 +1759,15 @@ define void @load_constant_hi_v2f16_reglo_vreg(ptr addrspace(4) %in, half %reg)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:-4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds half, ptr addrspace(4) %in, i64 -2047
@@ -1929,27 +1781,24 @@ entry:
 define void @load_constant_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -1957,16 +1806,15 @@ define void @load_constant_hi_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, half
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
@@ -1982,27 +1830,24 @@ entry:
 define void @load_constant_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, half %reg) #0 {
 ; GFX900-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -2010,16 +1855,15 @@ define void @load_constant_hi_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, half
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_constant_hi_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i8, ptr addrspace(4) %in, i64 -4095
@@ -2038,19 +1882,17 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4058
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2059,12 +1901,11 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2073,19 +1914,18 @@ define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, ptr addrspace(
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:4058
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
@@ -2101,19 +1941,17 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4059
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2122,12 +1960,11 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, ptr add
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2136,19 +1973,18 @@ define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, ptr add
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16_hi v0, off, s32 offset:4059
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
@@ -2165,19 +2001,17 @@ entry:
 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr addrspace(5) %obj0) #0 {
 ; GFX900-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4059
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2186,12 +2020,11 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr add
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2200,19 +2033,18 @@ define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, ptr add
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword v1, v2, off
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16_hi v0, off, s32 offset:4059
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
@@ -2231,43 +2063,43 @@ entry:
 define <2 x i16> @load_local_v2i16_split_multi_chain(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_v2i16_split_multi_chain:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0 offset:2
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_v2i16_split_multi_chain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v1, v0
 ; GFX906-NEXT:    ds_read_u16 v0, v0 offset:2
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_v2i16_split_multi_chain:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v1, v0
 ; GFX803-NEXT:    ds_read_u16 v0, v0 offset:2
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0 offset:2
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 1
@@ -2281,44 +2113,43 @@ entry:
 define <2 x i16> @load_local_lo_hi_v2i16_samechain(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_samechain:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_hi_v2i16_samechain:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v1, v0
 ; GFX906-NEXT:    ds_read_u16 v0, v0 offset:16
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_hi_v2i16_samechain:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v1, v0 offset:16
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v1, v0
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v1, v0 offset:16
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
@@ -2333,39 +2164,39 @@ entry:
 define <2 x i16> @load_local_v2i16_broadcast(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_v2i16_broadcast:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v0, v0
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_v2i16_broadcast:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_v2i16_broadcast:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_v2i16_broadcast:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v0, v0, s0
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 1
@@ -2378,19 +2209,18 @@ entry:
 define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr addrspace(3) %may.alias) #0 {
 ; GFX900-LABEL: load_local_lo_hi_v2i16_side_effect:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16 v2, v0
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX900-NEXT:    ds_write_b16 v1, v3
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16_hi v2, v0 offset:16
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_hi_v2i16_side_effect:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX906-NEXT:    ds_read_u16 v2, v0
 ; GFX906-NEXT:    ds_write_b16 v1, v3
@@ -2398,11 +2228,11 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr a
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_hi_v2i16_side_effect:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX803-NEXT:    ds_read_u16 v2, v0
@@ -2411,18 +2241,19 @@ define <2 x i16> @load_local_lo_hi_v2i16_side_effect(ptr addrspace(3) %in, ptr a
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v2, v0
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v3, 0x7b
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v3
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v2, v0 offset:16
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(3) %in, i32 8
@@ -2438,28 +2269,27 @@ entry:
 define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
 ; GFX900-LABEL: load_global_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_v2i16_split:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v3, v[0:1], off offset:2 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX906-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_v2i16_split:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
@@ -2468,16 +2298,17 @@ define <2 x i16> @load_global_v2i16_split(ptr addrspace(1) %in) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_global_v2i16_split:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 1
@@ -2492,17 +2323,16 @@ entry:
 define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
 ; GFX900-LABEL: load_flat_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16_hi v2, v[0:1] offset:2 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_v2i16_split:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v3, v[0:1] offset:2 glc
@@ -2510,11 +2340,11 @@ define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX906-NEXT:    s_waitcnt expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_v2i16_split:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
@@ -2523,16 +2353,17 @@ define <2 x i16> @load_flat_v2i16_split(ptr %in) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_flat_v2i16_split:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_ushort v2, v[0:1] glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    flat_load_short_d16_hi v2, v[0:1] offset:2 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr %in, i64 1
@@ -2547,27 +2378,26 @@ entry:
 define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
 ; GFX900-LABEL: load_constant_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX900-NEXT:    s_nop 0
 ; GFX900-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_v2i16_split:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX906-NEXT:    global_load_ushort v3, v[0:1], off offset:2 glc
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v3, v2, s4
+; GFX906-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_v2i16_split:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1] glc
@@ -2575,16 +2405,17 @@ define <2 x i16> @load_constant_v2i16_split(ptr addrspace(4) %in) #0 {
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_constant_v2i16_split:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_load_ushort v2, v[0:1], off glc
 ; GFX900-FLATSCR-NEXT:    s_nop 0
 ; GFX900-FLATSCR-NEXT:    global_load_short_d16_hi v2, v[0:1], off offset:2 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(4) %in, i64 1
@@ -2600,42 +2431,40 @@ entry:
 define <2 x i16> @load_private_v2i16_split(ptr addrspace(5) byval(i16) %in) #0 {
 ; GFX900-LABEL: load_private_v2i16_split:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 glc
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_v2i16_split:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_v2i16_split:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_v2i16_split:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ushort v0, off, s32 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16_hi v0, off, s32 offset:2 glc
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
@@ -2652,49 +2481,45 @@ entry:
 define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_hi_v2i16_store_local_lo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX900-NEXT:    ds_read_u16_d16_hi v2, v1
 ; GFX900-NEXT:    ds_write_b16 v1, v0
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX900-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_hi_v2i16_store_local_lo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v2, v1
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    ds_write_b16 v1, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v2, v2, v0, s4
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v2
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_hi_v2i16_store_local_lo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v2, v1
 ; GFX803-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX803-NEXT:    ds_write_b16 v1, v0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v2, v0, v2, s4
 ; GFX803-NEXT:    v_mov_b32_e32 v0, v2
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_store_local_lo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX900-FLATSCR-NEXT:    ds_read_u16_d16_hi v2, v1
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v1, v0
-; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
index 5843ac77baa9614..75ebc3bb2b8e856 100644
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -7,24 +7,21 @@
 define <2 x i16> @load_local_lo_v2i16_undeflo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_undeflo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16 v0, v0
-; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_undeflo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
-; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_undeflo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -35,39 +32,39 @@ entry:
 define <2 x i16> @load_local_lo_v2i16_reglo(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u16 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reglo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reglo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -80,47 +77,43 @@ entry:
 define void @load_local_lo_v2i16_reglo_vreg(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u16 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -133,27 +126,26 @@ entry:
 define <2 x i16> @load_local_lo_v2i16_zerolo(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_zerolo:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX900-NEXT:    ds_read_u16_d16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_zerolo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_zerolo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -164,30 +156,30 @@ entry:
 define <2 x half> @load_local_lo_v2f16_fpimm(ptr addrspace(3) %in) #0 {
 ; GFX900-LABEL: load_local_lo_v2f16_fpimm:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX900-NEXT:    ds_read_u16_d16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2f16_fpimm:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_movk_i32 s4, 0x4000
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, s4, v0, v1
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2f16_fpimm:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, 2.0, v0
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, ptr addrspace(3) %in
@@ -198,34 +190,31 @@ entry:
 define void @load_local_lo_v2f16_reghi_vreg(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2f16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u16_d16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2f16_reghi_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2f16_reghi_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -238,47 +227,43 @@ entry:
 define void @load_local_lo_v2f16_reglo_vreg(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u16 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load half, ptr addrspace(3) %in
@@ -291,34 +276,31 @@ entry:
 define void @load_local_lo_v2i16_reghi_vreg_zexti8(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_u8_d16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -332,47 +314,43 @@ entry:
 define void @load_local_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u8 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u8 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -386,34 +364,31 @@ entry:
 define void @load_local_lo_v2i16_reghi_vreg_sexti8(ptr addrspace(3) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    ds_read_i8_d16 v1, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_i8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_i8 v0, v0
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -427,47 +402,43 @@ entry:
 define void @load_local_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(3) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_i8 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_i8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_i8 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_i8 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -481,47 +452,43 @@ entry:
 define void @load_local_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u8 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u8 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u8 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -536,47 +503,43 @@ entry:
 define void @load_local_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(3) %in, half %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_i8 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_i8 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_i8 v0, v0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_i8 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i8, ptr addrspace(3) %in
@@ -591,7 +554,6 @@ entry:
 define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <2 x i16> %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u16 v0, v0
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0xffff
@@ -599,12 +561,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
 ; GFX900-MUBUF-NEXT:    ds_write_b16 v2, v0
 ; GFX900-MUBUF-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
@@ -612,12 +573,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
 ; GFX906-NEXT:    ds_write_b16 v2, v0
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    v_mov_b32_e32 v2, 0
@@ -626,12 +586,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
 ; GFX803-NEXT:    ds_write_b16 v2, v0
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
@@ -639,7 +598,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(ptr addrspace(3) %in, <
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v2, v0
 ; GFX900-FLATSCR-NEXT:    v_bfi_b32 v0, s0, v0, v1
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -653,43 +612,40 @@ entry:
 define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(ptr addrspace(3) %in, <2 x i16> %reg) #0 {
 ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX900-NEXT:    ds_read_u16_d16 v1, v0
 ; GFX900-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX900-NEXT:    ds_write_b16 v0, v2
-; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX906-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX906-NEXT:    ds_write_b16 v3, v2
-; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    ds_write_b16 v3, v2
-; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -703,7 +659,6 @@ entry:
 define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noalias %in, <2 x i16> %reg, ptr addrspace(3) noalias %out0, ptr addrspace(3) noalias %out1) #0 {
 ; GFX900-MUBUF-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    ds_read_u16 v0, v0
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX900-MUBUF-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -712,12 +667,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
 ; GFX900-MUBUF-NEXT:    ds_write_b16 v3, v4
 ; GFX900-MUBUF-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -726,12 +680,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
 ; GFX906-NEXT:    ds_write_b16 v3, v4
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
@@ -741,12 +694,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
 ; GFX803-NEXT:    ds_write_b16 v3, v4
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    ds_read_u16 v0, v0
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX900-FLATSCR-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
@@ -755,7 +707,7 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(ptr addrspace(3) noal
 ; GFX900-FLATSCR-NEXT:    ds_write_b16 v3, v4
 ; GFX900-FLATSCR-NEXT:    v_bfi_b32 v0, s0, v0, v1
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
@@ -770,27 +722,24 @@ entry:
 define void @load_global_lo_v2i16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -798,7 +747,7 @@ define void @load_global_lo_v2i16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -812,27 +761,24 @@ entry:
 define void @load_global_lo_v2f16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -840,7 +786,7 @@ define void @load_global_lo_v2f16_reglo_vreg(ptr addrspace(1) %in, i32 %reg) #0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -854,27 +800,24 @@ entry:
 define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -882,7 +825,7 @@ define void @load_global_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -897,27 +840,24 @@ entry:
 define void @load_global_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -925,7 +865,7 @@ define void @load_global_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -940,27 +880,24 @@ entry:
 define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -968,7 +905,7 @@ define void @load_global_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -984,27 +921,24 @@ entry:
 define void @load_global_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -1012,7 +946,7 @@ define void @load_global_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(1) %in, i32 %r
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1028,33 +962,30 @@ entry:
 define void @load_flat_lo_v2i16_reghi_vreg(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2i16_reghi_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2i16_reghi_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1067,33 +998,30 @@ entry:
 define void @load_flat_lo_v2f16_reghi_vreg(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reghi_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2f16_reghi_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2f16_reghi_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 
 ; FIXME: the and above should be removable
@@ -1108,33 +1036,30 @@ entry:
 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1148,33 +1073,30 @@ entry:
 define void @load_flat_lo_v2i16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1188,33 +1110,30 @@ entry:
 define void @load_flat_lo_v2f16_reglo_vreg_zexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1229,33 +1148,30 @@ entry:
 define void @load_flat_lo_v2f16_reglo_vreg_sexti8(ptr %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1270,42 +1186,38 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg(ptr addrspace(5) byval(i16) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1319,46 +1231,42 @@ entry:
 define void @load_private_lo_v2i16_reghi_vreg(ptr addrspace(5) byval(i16) %in, i16 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX900-MUBUF-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ushort v1, off, s32 offset:4094
 ; GFX900-FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_perm_b32 v0, v0, v1, s0
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i64 2047
@@ -1372,42 +1280,38 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg(ptr addrspace(5) byval(half) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1421,43 +1325,39 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1470,43 +1370,39 @@ entry:
 define void @load_private_lo_v2i16_reghi_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1519,43 +1415,39 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg_nooff(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1568,42 +1460,38 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_zexti8(ptr addrspace(5) byval(i8) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1618,42 +1506,38 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_sexti8(ptr addrspace(5) byval(i8) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1668,43 +1552,39 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1718,43 +1598,39 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1768,43 +1644,39 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(ptr addrspace(5) %in, i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1819,27 +1691,24 @@ entry:
 define void @load_constant_lo_v2i16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2i16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_lo_v2i16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_lo_v2i16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -1847,7 +1716,7 @@ define void @load_constant_lo_v2i16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
@@ -1861,27 +1730,24 @@ entry:
 define void @load_constant_lo_v2f16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
@@ -1889,7 +1755,7 @@ define void @load_constant_lo_v2f16_reglo_vreg(ptr addrspace(4) %in, i32 %reg) #
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1903,27 +1769,24 @@ entry:
 define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -1931,7 +1794,7 @@ define void @load_constant_lo_v2f16_reglo_vreg_zexti8(ptr addrspace(4) %in, i32
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1947,27 +1810,24 @@ entry:
 define void @load_constant_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, i32 %reg) #0 {
 ; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
 ; GFX900:       ; %bb.0: ; %entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    global_store_dword v[0:1], v2, off
-; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
 ; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
@@ -1975,7 +1835,7 @@ define void @load_constant_lo_v2f16_reglo_vreg_sexti8(ptr addrspace(4) %in, i32
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
@@ -1991,7 +1851,6 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -1999,12 +1858,11 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, v1, s[0:3], s32 offen offset:4054 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2014,12 +1872,11 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2029,12 +1886,11 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2042,7 +1898,7 @@ define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s0 offset:4054 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
@@ -2059,7 +1915,6 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2067,12 +1922,11 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2082,12 +1936,11 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2097,12 +1950,11 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2110,7 +1962,7 @@ define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s0 offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
@@ -2128,7 +1980,6 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2136,12 +1987,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2151,12 +2001,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2166,12 +2015,11 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2179,7 +2027,7 @@ define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s0 offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
@@ -2197,7 +2045,6 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2205,12 +2052,11 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2220,12 +2066,11 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2235,12 +2080,11 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2248,7 +2092,7 @@ define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s0 offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
@@ -2267,7 +2111,6 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
 ; GFX900-MUBUF:       ; %bb.0: ; %entry
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
@@ -2275,12 +2118,11 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
 ; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
 ; GFX906:       ; %bb.0: ; %entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -2290,12 +2132,11 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX906-NEXT:    global_store_dword v[0:1], v0, off
-; GFX906-NEXT:    s_waitcnt vmcnt(0)
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
 ; GFX803:       ; %bb.0: ; %entry
-; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
 ; GFX803-NEXT:    s_waitcnt vmcnt(0)
@@ -2305,12 +2146,11 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
-; GFX803-NEXT:    s_waitcnt vmcnt(0)
+; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
 ; GFX900-FLATSCR:       ; %bb.0: ; %entry
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
@@ -2318,7 +2158,7 @@ define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
 ; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s0 offset:4055 glc
 ; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
-; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %obj0 = alloca [10 x i32], align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
index 10dca76cc389aef..f5d840fbddd7a9f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll
@@ -8,42 +8,37 @@
 define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b128 v[0:3], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b128 v[0:3], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b64 v[2:3], v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b128 v[0:3], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr
   ret <4 x i32> %load
@@ -52,7 +47,6 @@ define <4 x i32> @load_lds_v4i32(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -69,31 +63,24 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
 ; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
-; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
@@ -103,19 +90,15 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
@@ -143,11 +126,11 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32_align1:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
@@ -164,21 +147,15 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_u8 v6, v6
 ; GFX6-NEXT:    ds_read_u8 v7, v7
 ; GFX6-NEXT:    ds_read_u8 v8, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
@@ -217,11 +194,11 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -238,31 +215,24 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
 ; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
-; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
 ; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
@@ -279,27 +249,21 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
 ; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 1
   ret <4 x i32> %load
@@ -308,7 +272,6 @@ define <4 x i32> @load_lds_v4i32_align1(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16 v1, v0
 ; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -317,19 +280,16 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
@@ -339,24 +299,20 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
 ; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
 ; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32_align2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
@@ -373,26 +329,20 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_u16 v6, v6
 ; GFX6-NEXT:    ds_read_u16 v7, v7
 ; GFX6-NEXT:    ds_read_u16 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v1, v0
 ; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
@@ -401,19 +351,16 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
 ; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
 ; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v1, v0
 ; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
 ; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
@@ -422,14 +369,12 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
 ; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
 ; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 2
   ret <4 x i32> %load
@@ -438,26 +383,23 @@ define <4 x i32> @load_lds_v4i32_align2(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32_align4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
@@ -466,25 +408,23 @@ define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_b32 v3, v3
 ; GFX6-NEXT:    ds_read_b32 v0, v0
 ; GFX6-NEXT:    ds_read_b32 v1, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
 ; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 4
   ret <4 x i32> %load
@@ -493,42 +433,37 @@ define <4 x i32> @load_lds_v4i32_align4(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b64 v[2:3], v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 8
   ret <4 x i32> %load
@@ -537,42 +472,37 @@ define <4 x i32> @load_lds_v4i32_align8(ptr addrspace(3) %ptr) {
 define <4 x i32> @load_lds_v4i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v4i32_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b128 v[0:3], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v4i32_align16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b128 v[0:3], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v4i32_align16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b64 v[2:3], v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v4i32_align16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b128 v[0:3], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v4i32_align16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b128 v[0:3], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <4 x i32>, ptr addrspace(3) %ptr, align 16
   ret <4 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index 2da3fce72072ee7..de42671fbf2f7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -8,42 +8,37 @@
 define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b96 v[0:2], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b96 v[0:2], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b32 v2, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b96 v[0:2], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr
   ret <3 x i32> %load
@@ -52,7 +47,6 @@ define <3 x i32> @load_lds_v3i32(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u8 v1, v0
 ; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -65,26 +59,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
 ; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
-; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
 ; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
@@ -94,23 +83,19 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
 ; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
 ; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
 ; GFX7-NEXT:    ds_read_u8 v5, v0 offset:11
 ; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
 ; GFX7-NEXT:    ds_read_u8 v7, v0 offset:9
 ; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
 ; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
 ; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -124,11 +109,11 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32_align1:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
@@ -145,21 +130,16 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_u8 v6, v6
 ; GFX6-NEXT:    ds_read_u8 v7, v7
 ; GFX6-NEXT:    ds_read_u8 v8, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    ds_read_u8 v4, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
@@ -167,7 +147,6 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_u8 v5, v5
 ; GFX6-NEXT:    ds_read_u8 v6, v6
 ; GFX6-NEXT:    ds_read_u8 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
@@ -182,11 +161,11 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u8 v1, v0
 ; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
 ; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
@@ -199,26 +178,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
 ; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
 ; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
-; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u8 v1, v0
 ; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
 ; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
@@ -231,22 +205,18 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 ; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
 ; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
 ; GFX11-NEXT:    ds_load_u8 v0, v0 offset:11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
 ; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 1
   ret <3 x i32> %load
@@ -255,24 +225,21 @@ define <3 x i32> @load_lds_v3i32_align1(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_u16 v1, v0
 ; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
 ; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
@@ -280,21 +247,18 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX7-NEXT:    ds_read_u16 v4, v0
 ; GFX7-NEXT:    ds_read_u16 v5, v0 offset:6
 ; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
 ; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32_align2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
@@ -307,51 +271,44 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 ; GFX6-NEXT:    ds_read_u16 v4, v4
 ; GFX6-NEXT:    ds_read_u16 v5, v5
 ; GFX6-NEXT:    ds_read_u16 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_u16 v1, v0
 ; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
 ; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
 ; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
 ; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
 ; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
-; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_u16 v1, v0
 ; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
 ; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
 ; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
 ; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
 ; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
-; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 2
   ret <3 x i32> %load
@@ -360,51 +317,46 @@ define <3 x i32> @load_lds_v3i32_align2(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align4:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align4:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32_align4:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    ds_read_b32 v2, v2
 ; GFX6-NEXT:    ds_read_b32 v0, v0
 ; GFX6-NEXT:    ds_read_b32 v1, v1
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align4:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align4:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 4
   ret <3 x i32> %load
@@ -413,50 +365,45 @@ define <3 x i32> @load_lds_v3i32_align4(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align8:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32_align8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b32 v2, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    ds_load_b64 v[0:1], v0
 ; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 8
   ret <3 x i32> %load
@@ -465,42 +412,37 @@ define <3 x i32> @load_lds_v3i32_align8(ptr addrspace(3) %ptr) {
 define <3 x i32> @load_lds_v3i32_align16(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: load_lds_v3i32_align16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b96 v[0:2], v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: load_lds_v3i32_align16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b96 v[0:2], v0
-; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-LABEL: load_lds_v3i32_align16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
 ; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b32 v2, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: load_lds_v3i32_align16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b96 v[0:2], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: load_lds_v3i32_align16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_load_b96 v[0:2], v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load = load <3 x i32>, ptr addrspace(3) %ptr, align 16
   ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
index 209f9516ba29a2e..e2e42f49af90be1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-assert.ll
@@ -4,9 +4,8 @@
 define <2 x i32> @range_metata_sext_range_0_i24_i64_bitcast(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metata_sext_range_0_i24_i64_bitcast:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !{}
   %shl = shl i64 %val, 22
@@ -18,11 +17,11 @@ define <2 x i32> @range_metata_sext_range_0_i24_i64_bitcast(ptr addrspace(1) %pt
 define <2 x i32> @no_range_sext_range_0_i24_i64_bitcast(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: no_range_sext_range_0_i24_i64_bitcast:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b64 v[0:1], 22, v[0:1]
 ; GCN-NEXT:    v_ashrrev_i64 v[0:1], 22, v[0:1]
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i64, ptr addrspace(1) %ptr, align 4
   %shl = shl i64 %val, 22
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
index a4cf4d6ed2c8eca..6b87f4a87514904 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
@@ -4,9 +4,8 @@
 define i32 @range_metadata_sext_i8_signed_range_i32(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i8_signed_range_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !0, !noundef !{} ; [-127, 128)
   %shl = shl i32 %val, 24
@@ -17,10 +16,10 @@ define i32 @range_metadata_sext_i8_signed_range_i32(ptr addrspace(1) %ptr) {
 define i32 @range_metadata_sext_upper_range_limited_i32(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_upper_range_limited_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !1, !noundef !{} ; [-127, 256)
   %shl = shl i32 %val, 24
@@ -31,10 +30,10 @@ define i32 @range_metadata_sext_upper_range_limited_i32(ptr addrspace(1) %ptr) {
 define i32 @range_metadata_sext_lower_range_limited_i32(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_lower_range_limited_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !2 ; [-255, 128)
   %shl = shl i32 %val, 24
@@ -45,10 +44,10 @@ define i32 @range_metadata_sext_lower_range_limited_i32(ptr addrspace(1) %ptr) {
 define i32 @range_metadata_sext_i8_neg_neg_range_i32(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i8_neg_neg_range_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 63, v0
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !3, !noundef !{}
   %shl = shl i32 %val, 25
@@ -59,9 +58,8 @@ define i32 @range_metadata_sext_i8_neg_neg_range_i32(ptr addrspace(1) %ptr) {
 define i32 @range_metadata_sextload_i8_signed_range_i4_i32(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sextload_i8_signed_range_i4_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_sbyte v0, v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %load = load volatile i8, ptr addrspace(1) %ptr, align 1, !range !4, !noundef !{}
   %shl = shl i8 %load, 3
@@ -73,10 +71,10 @@ define i32 @range_metadata_sextload_i8_signed_range_i4_i32(ptr addrspace(1) %ptr
 define i25 @range_metadata_sext_i8_signed_range_i25(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i8_signed_range_i25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 2
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i25, ptr addrspace(1) %ptr, align 4, !range !5, !noundef !{}
   %shl = shl i25 %val, 23
@@ -87,9 +85,8 @@ define i25 @range_metadata_sext_i8_signed_range_i25(ptr addrspace(1) %ptr) {
 define i32 @range_metadata_i32_neg1_to_1(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_i32_neg1_to_1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i32, ptr addrspace(1) %ptr, align 4, !range !6, !noundef !{}
   %shl = shl i32 %val, 31
@@ -100,11 +97,11 @@ define i32 @range_metadata_i32_neg1_to_1(ptr addrspace(1) %ptr) {
 define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i8_signed_range_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 23, v0
 ; GCN-NEXT:    v_ashrrev_i64 v[0:1], 55, v[0:1]
+; GCN-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !7, !noundef !{}
   %shl = shl i64 %val, 55
@@ -115,9 +112,8 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) {
 define i64 @range_metadata_sext_i32_signed_range_i64(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i32_signed_range_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !7, !noundef !{}
   %shl = shl i64 %val, 31
@@ -128,9 +124,8 @@ define i64 @range_metadata_sext_i32_signed_range_i64(ptr addrspace(1) %ptr) {
 define i64 @range_metadata_sext_i33_signed_range_i64(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: range_metadata_sext_i33_signed_range_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile i64, ptr addrspace(1) %ptr, align 4, !range !8, !noundef !{}
   %shl = shl i64 %val, 30
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 38e79350c0e181a..1f95bfe3085d219 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -7,26 +7,23 @@
 define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
 ; VI-LABEL: lds_atomic_fadd_ret_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 4.0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_ret_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -40,16 +37,17 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fadd_ret_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v1, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -63,11 +61,13 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret float %result
@@ -76,26 +76,23 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
 define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; VI-LABEL: lds_atomic_fadd_noret_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 4.0
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_add_f32 v0, v1
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_noret_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    ds_add_f32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v1, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -109,15 +106,16 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fadd_noret_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v1, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -131,10 +129,12 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v2
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, float 4.0 seq_cst
   ret void
@@ -871,7 +871,6 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-LABEL: lds_atomic_fadd_ret_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    v_mov_b32_e32 v2, v0
 ; VI-NEXT:    ds_read_b64 v[0:1], v0
@@ -887,15 +886,16 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB4_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_ret_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
@@ -910,15 +910,16 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    ds_read_b64 v[0:1], v0
@@ -934,15 +935,16 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fadd_ret_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX8-NEXT:    ds_read_b64 v[0:1], v0
@@ -958,10 +960,12 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret double %result
@@ -970,7 +974,6 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-LABEL: lds_atomic_fadd_noret_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_read_b64 v[1:2], v0
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
@@ -985,15 +988,16 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    v_mov_b32_e32 v1, v3
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v2, v4
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB5_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fadd_noret_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b64 v[1:2], v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
@@ -1007,15 +1011,16 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b64 v[1:2], v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1030,15 +1035,16 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v4
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fadd_noret_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b64 v[1:2], v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -1053,10 +1059,12 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
   ret void
@@ -1065,7 +1073,6 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
 ; VI-LABEL: lds_atomic_fsub_ret_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_read_b32 v2, v0
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
@@ -1079,16 +1086,17 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB6_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fsub_ret_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v2, v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
@@ -1101,16 +1109,17 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_ret_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v2, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1124,16 +1133,17 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fsub_ret_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v2, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -1147,11 +1157,13 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret float %result
@@ -1160,7 +1172,6 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwind {
 ; VI-LABEL: lds_atomic_fsub_noret_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_read_b32 v2, v0
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
@@ -1174,15 +1185,16 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB7_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fsub_noret_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v2, v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
@@ -1195,15 +1207,16 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_noret_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b32 v2, v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1217,15 +1230,16 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v3
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fsub_noret_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b32 v2, v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -1239,10 +1253,12 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, v3
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, float %val seq_cst
   ret void
@@ -1251,7 +1267,6 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
 ; VI-LABEL: lds_atomic_fsub_ret_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_read_b64 v[3:4], v0
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
@@ -1266,17 +1281,18 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB8_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
 ; VI-NEXT:    v_mov_b32_e32 v1, v4
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fsub_ret_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
@@ -1290,17 +1306,18 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_ret_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1315,17 +1332,18 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fsub_ret_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -1340,12 +1358,14 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret double %result
@@ -1354,7 +1374,6 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounwind {
 ; VI-LABEL: lds_atomic_fsub_noret_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 m0, -1
 ; VI-NEXT:    ds_read_b64 v[3:4], v0
 ; VI-NEXT:    s_mov_b64 s[4:5], 0
@@ -1369,15 +1388,16 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; VI-NEXT:    v_mov_b32_e32 v3, v5
 ; VI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, v6
+; VI-NEXT:    s_waitcnt expcnt(0)
 ; VI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_cbranch_execnz .LBB9_1
 ; VI-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lds_atomic_fsub_noret_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
@@ -1391,15 +1411,16 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_noret_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], 0
@@ -1414,15 +1435,16 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX7-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v6
+; GFX7-NEXT:    s_waitcnt expcnt(0)
 ; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: lds_atomic_fsub_noret_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_read_b64 v[3:4], v0
 ; GFX8-NEXT:    s_mov_b64 s[4:5], 0
@@ -1437,10 +1459,12 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v6
+; GFX8-NEXT:    s_waitcnt expcnt(0)
 ; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
 ; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(3) %ptr, double %val seq_cst
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 52f97150e4b301b..e7e97d2848639a0 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -106,7 +106,6 @@ entry:
 define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; MUBUF-LABEL: func_local_stack_offset_uses_sp:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_mov_b32 s5, s33
 ; MUBUF-NEXT:    s_add_i32 s33, s32, 0x7ffc0
 ; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfff80000
@@ -144,12 +143,11 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; MUBUF-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v6
 ; MUBUF-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
 ; MUBUF-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_mov_b32 s2, s33
 ; FLATSCR-NEXT:    s_add_i32 s33, s32, 0x1fff
 ; FLATSCR-NEXT:    s_and_b32 s33, s33, 0xffffe000
@@ -182,7 +180,7 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
 ; FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; FLATSCR-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
 ; FLATSCR-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %pin.low = alloca i32, align 8192, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
index 7738ec689dc7ebb..f50b83e77628549 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll
@@ -8,7 +8,6 @@
 define <3 x float> @liveout_undef_subrange(<3 x float> %arg) {
 ; CHECK-LABEL: liveout_undef_subrange:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_f32_e32 v3, v2, v2
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    ; kill: killed $vgpr1
@@ -17,6 +16,7 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v2
 ; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb2
@@ -26,6 +26,7 @@ define <3 x float> @liveout_undef_subrange(<3 x float> %arg) {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.3: ; %DummyReturnBlock
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
index 546022b4f9c43dd..6e6c35e050c5527 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
@@ -18,7 +18,6 @@ define void @loop_on_argument(i1 %arg) {
 ;
 ; CHECK-LABEL: loop_on_argument:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
@@ -28,11 +27,12 @@ define void @loop_on_argument(i1 %arg) {
 ; CHECK-NEXT:    s_and_b64 s[6:7], exec, vcc
 ; CHECK-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; CHECK-NEXT:    global_store_dword v[0:1], v0, off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
index a407cd20bf76244..752b4411d0105b9 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -9,7 +9,6 @@
 define void @needs_and(i32 %arg) {
 ; GCN-LABEL: needs_and:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s10, 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
 ; GCN-NEXT:    s_branch .LBB0_2
@@ -25,16 +24,17 @@ define void @needs_and(i32 %arg) {
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], s10, v0
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
 ; GCN-NEXT:  ; %bb.3: ; %then
 ; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
 ; GCN-NEXT:    s_branch .LBB0_1
 ; GCN-NEXT:  .LBB0_4: ; %loopexit
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %loop
@@ -63,7 +63,6 @@ loopexit:
 define void @doesnt_need_and(i32 %arg) {
 ; GCN-LABEL: doesnt_need_and:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB1_1: ; %loop
@@ -72,11 +71,12 @@ define void @doesnt_need_and(i32 %arg) {
 ; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
 ; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_1
 ; GCN-NEXT:  ; %bb.2: ; %loopexit
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %loop
@@ -98,7 +98,6 @@ loopexit:
 define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 ; GCN-LABEL: break_cond_is_arg:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
@@ -116,6 +115,7 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 ; GCN-NEXT:  .LBB2_2: ; %loop
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, s10, v0
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB2_1
 ; GCN-NEXT:  ; %bb.3: ; %then
@@ -124,7 +124,7 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 ; GCN-NEXT:    s_branch .LBB2_1
 ; GCN-NEXT:  .LBB2_4: ; %loopexit
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %loop
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index b8e74bc7db09a1a..38e67a57a2a6d6a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -7,7 +7,6 @@
 define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49280.not, i32 %arg1, i1 %cmp108) {
 ; CHECK-LABEL: machinesink_loop_variable_out_of_divergent_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v1, 1, v1
 ; CHECK-NEXT:    v_and_b32_e32 v3, 1, v3
 ; CHECK-NEXT:    s_mov_b32 s5, 0
@@ -37,6 +36,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    ; Child Loop BB0_6 Depth 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b32 s7, s6
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_2
 ; CHECK-NEXT:  ; %bb.4: ; %for.body51.preheader
@@ -78,7 +78,7 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_9
 ; CHECK-NEXT:  ; %bb.10: ; %DummyReturnBlock
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   br label %for.body33
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index b5442b40c2044e0..bf3c6a0965dcf62 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -12,42 +12,42 @@
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -61,50 +61,50 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, 0x3c00, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -118,48 +118,48 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -174,63 +174,63 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
 ; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -246,57 +246,56 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
 ; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GISEL-GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 16
 ; GISEL-GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -304,11 +303,11 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 16
 ; GISEL-VI-NEXT:    v_lshlrev_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -316,6 +315,7 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; GISEL-CI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -331,46 +331,46 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -386,37 +386,36 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v0 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -430,6 +429,7 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -445,25 +445,24 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
 ; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX11-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX11-NEXT:    global_store_b16 v[0:1], v3, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX9-NEXT:    global_store_short v[0:1], v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -472,23 +471,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    s_mov_b32 s7, 0xf000
 ; SDAG-CI-NEXT:    s_mov_b32 s6, -1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v0 clamp
 ; SDAG-CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -506,6 +504,7 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 00ea05b0dd24c44..b257dcba7cb7262 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -14,42 +14,42 @@
 define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
 ; GFX1100-LABEL: mixlo_simple:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_simple:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_simple:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_simple:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_simple:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_simple:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
   %cvt.result = fptrunc float %result to half
@@ -59,44 +59,44 @@ define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
 define half @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
 ; GFX1100-LABEL: mixlo_simpl_no_flush:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_simpl_no_flush:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_simpl_no_flush:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_simpl_no_flush:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_simpl_no_flush:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_simpl_no_flush:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
   %cvt.result = fptrunc float %result to half
@@ -106,48 +106,48 @@ define half @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
 define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -160,53 +160,53 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src
 define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1, half %src2) {
 ; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -219,46 +219,46 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1,
 define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -270,42 +270,41 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
 define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e64 v0, v2 clamp
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
@@ -318,6 +317,7 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -331,50 +331,50 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
 define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
 ; GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -391,32 +391,31 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src
 define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v2f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v2f32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_v2f32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -428,11 +427,11 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -451,11 +450,11 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v4, v1
@@ -467,11 +466,11 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e32 v0, v5
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -482,6 +481,7 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v4
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -494,35 +494,34 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v3f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v6
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v3f32:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v3f32:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v3f32:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -539,11 +538,11 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v3f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -571,29 +570,29 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v3f32:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v3f32:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v3f32:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v2
@@ -610,11 +609,11 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v3f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -630,6 +629,7 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v6
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v7
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v8
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>
@@ -642,7 +642,6 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v4f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -650,33 +649,33 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v4f32:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v7
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v6
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v4f32:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v7
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v6
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v4f32:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
@@ -699,11 +698,11 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v3, v5
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v4f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
@@ -740,33 +739,33 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v4f32:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v6
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v1, v7
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v4f32:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v6
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v1, v7
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
@@ -789,11 +788,11 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-VI-NEXT:    v_or_b32_e32 v1, v2, v3
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v4f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -814,6 +813,7 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v9
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v10
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v11
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -828,32 +828,31 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -865,11 +864,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v1, v2 clamp
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -888,11 +887,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v4, v1
@@ -904,11 +903,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e64 v0, v5 clamp
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v1, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -933,6 +932,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -950,7 +950,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -959,33 +958,33 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v1, v1, 0
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v1, v1, 0
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1002,11 +1001,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v1, v5 clamp
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -1034,11 +1033,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1046,31 +1045,31 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; GISEL-GFX1100-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v2
@@ -1087,11 +1086,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e64 v1, v5 clamp
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1127,6 +1126,7 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>
@@ -1141,7 +1141,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1149,33 +1148,33 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX906-NEXT:    v_mov_b32_e32 v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -1198,11 +1197,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v3, v5 clamp
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
@@ -1239,11 +1238,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
@@ -1266,11 +1265,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v3, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-VI-NEXT:    v_or_b32_e32 v1, v2, v3
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1317,6 +1316,7 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -1334,32 +1334,31 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1371,11 +1370,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v1, v2 clamp
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -1394,11 +1393,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, v0 clamp
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v4, v3
@@ -1407,31 +1406,31 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GISEL-GFX1100-NEXT:    v_and_or_b32 v0, 0xffff0000, v4, v0
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_max_f16_e64 v4, v3, v3 clamp
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, 0xffff0000
 ; GISEL-GFX900-NEXT:    v_and_or_b32 v0, v3, v0, v4
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_max_f16_e64 v4, v3, v3 clamp
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, 0xffff0000
 ; GISEL-GFX906-NEXT:    v_and_or_b32 v0, v3, v0, v4
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v4, v1
@@ -1446,11 +1445,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; GISEL-VI-NEXT:    v_max_f16_e64 v1, v0, v0 clamp
 ; GISEL-VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
@@ -1474,6 +1473,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1490,32 +1490,31 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1527,11 +1526,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1550,11 +1549,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v5
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1563,11 +1562,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GISEL-GFX1100-NEXT:    v_and_or_b32 v0, 0xffff, v4, v0
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
@@ -1575,11 +1574,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GISEL-GFX900-NEXT:    v_and_or_b32 v0, v3, v1, v0
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
@@ -1587,11 +1586,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GISEL-GFX906-NEXT:    v_and_or_b32 v0, v3, v1, v0
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v4, v1
@@ -1605,11 +1604,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-VI-NEXT:    v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GISEL-VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
@@ -1635,6 +1634,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-CI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1653,7 +1653,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1661,31 +1660,31 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-GFX1100-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1697,11 +1696,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -1720,11 +1719,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -1732,31 +1731,31 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; GISEL-GFX1100-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v4, v1
@@ -1768,11 +1767,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e32 v1, v3
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1783,6 +1782,7 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v3, v5 clamp
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1799,7 +1799,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
@@ -1809,11 +1808,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v6
@@ -1821,11 +1820,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v2
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v6
@@ -1833,11 +1832,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v2
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -1854,11 +1853,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-VI-NEXT:    v_mad_f32 v1, v1, v3, v5 clamp
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -1886,11 +1885,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
@@ -1900,11 +1899,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v6
@@ -1912,11 +1911,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v6
@@ -1924,11 +1923,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v2
@@ -1945,11 +1944,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-VI-NEXT:    v_mad_f32 v1, v1, v3, v5 clamp
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1965,6 +1964,7 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>
@@ -1979,7 +1979,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
@@ -1993,11 +1992,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
@@ -2008,11 +2007,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
@@ -2023,11 +2022,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -2050,11 +2049,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v3
 ; SDAG-VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
@@ -2091,11 +2090,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
@@ -2109,11 +2108,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
@@ -2124,11 +2123,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GISEL-GFX900-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
@@ -2139,11 +2138,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GISEL-GFX906-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v7, v1
@@ -2166,11 +2165,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-VI-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v3, v0
 ; GISEL-VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -2191,6 +2190,7 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
   %src1.ext = fpext <4 x half> %src1 to <4 x float>
@@ -2205,45 +2205,45 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
 ; GFX1100-LABEL: mixlo_zext:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_zext:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, v2
 ; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_zext:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_zext:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_zext:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_zext:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v2, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
   %cvt.result = fptrunc float %result to half
@@ -2255,42 +2255,42 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
 define half @mixlo_fptrunc(float %a, float %b) #0 {
 ; GFX1100-LABEL: mixlo_fptrunc:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, 0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_fptrunc:
 ; GFX900:       ; %bb.0: ; %.entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, v0, v1, 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_fptrunc:
 ; GFX906:       ; %bb.0: ; %.entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_fptrunc:
 ; VI:       ; %bb.0: ; %.entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_fptrunc:
 ; SDAG-CI:       ; %bb.0: ; %.entry
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_fptrunc:
 ; GISEL-CI:       ; %bb.0: ; %.entry
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul = fmul float %a, %b
@@ -2301,43 +2301,43 @@ define half @mixlo_fptrunc(float %a, float %b) #0 {
 define half @mixlo_fptrunc_no_flush(float %a, float %b) {
 ; GFX1100-LABEL: mixlo_fptrunc_no_flush:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, v0, v1, 0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_fptrunc_no_flush:
 ; GFX900:       ; %bb.0: ; %.entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_fptrunc_no_flush:
 ; GFX906:       ; %bb.0: ; %.entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_fptrunc_no_flush:
 ; VI:       ; %bb.0: ; %.entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_fptrunc_no_flush:
 ; SDAG-CI:       ; %bb.0: ; %.entry
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_fptrunc_no_flush:
 ; GISEL-CI:       ; %bb.0: ; %.entry
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %mul = fmul float %a, %b
@@ -2348,42 +2348,42 @@ define half @mixlo_fptrunc_no_flush(float %a, float %b) {
 define half @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
 ; GFX1100-LABEL: mixlo_fptrunc_abs_src_mod:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, |v0|, v1, 0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_fptrunc_abs_src_mod:
 ; GFX900:       ; %bb.0: ; %.entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, |v0|, v1, 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_fptrunc_abs_src_mod:
 ; GFX906:       ; %bb.0: ; %.entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, |v0|, v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_fptrunc_abs_src_mod:
 ; VI:       ; %bb.0: ; %.entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_fptrunc_abs_src_mod:
 ; SDAG-CI:       ; %bb.0: ; %.entry
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_fptrunc_abs_src_mod:
 ; GISEL-CI:       ; %bb.0: ; %.entry
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a.fabs = call float @llvm.fabs.f32(float %a)
@@ -2395,42 +2395,42 @@ define half @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
 define half @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 {
 ; GFX1100-LABEL: mixlo_fptrunc_neg_src_mod:
 ; GFX1100:       ; %bb.0: ; %.entry
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mixlo_f16 v0, -v0, v1, 0
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: mixlo_fptrunc_neg_src_mod:
 ; GFX900:       ; %bb.0: ; %.entry
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mixlo_f16 v0, -v0, v1, 0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: mixlo_fptrunc_neg_src_mod:
 ; GFX906:       ; %bb.0: ; %.entry
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, -v0, v1, 0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mixlo_fptrunc_neg_src_mod:
 ; VI:       ; %bb.0: ; %.entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: mixlo_fptrunc_neg_src_mod:
 ; SDAG-CI:       ; %bb.0: ; %.entry
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: mixlo_fptrunc_neg_src_mod:
 ; GISEL-CI:       ; %bb.0: ; %.entry
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
 .entry:
   %a.fneg = fneg float %a
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index 2f749900214ee52..1c3ed897d100686 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -14,44 +14,44 @@
 define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -63,34 +63,33 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -98,6 +97,7 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.hi = lshr i32 %src0, 16
   %src1.hi = lshr i32 %src1, 16
@@ -118,44 +118,44 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v1, v3, v5
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v5
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.hi = extractelement <2 x half> %src0, i32 1
   %src1.hi = extractelement <2 x half> %src1, i32 1
@@ -170,32 +170,31 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v2f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -204,11 +203,11 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v1, v3, v5
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v0, v4, v6
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v6, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -224,27 +223,27 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v3, v1, v5
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v1, v3
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v0, v4, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v5, v1
@@ -253,11 +252,11 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v5
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v1, v4, v6
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -266,6 +265,7 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v6, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v7, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -277,32 +277,31 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v2f32_shuffle:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
 ; GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_v2f32_shuffle:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
 ; GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_v2f32_shuffle:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
@@ -311,11 +310,11 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 ; VI-NEXT:    v_mad_f32 v0, v3, v0, v2
 ; VI-NEXT:    v_mac_f32_e32 v2, v4, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -328,11 +327,11 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v4, v2, v1
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v1, v5, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GISEL-CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-CI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -348,6 +347,7 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v4, v0, v1
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v5, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> <i32 0, i32 1>
@@ -362,53 +362,53 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e64 v3, -v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e64 v3, -v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -421,44 +421,44 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s
 define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -471,44 +471,44 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s
 define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mad_f32 v0, -|v0|, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, -|v0|, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mad_f32 v0, -|v0|, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -522,42 +522,42 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half
 define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -568,42 +568,42 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2)
 define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, -v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -615,42 +615,42 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr
 define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, |v2|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, |v2|
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, |v2|
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -662,42 +662,42 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr
 define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, -|v2|
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, -|v2|
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, -|v2|
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -714,68 +714,68 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float
 define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 1.0
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 1.0
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 1.0
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, 1.0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, 1.0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -786,69 +786,69 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
 define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0.15915494
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0.15915494
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0.15915494
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, 0.15915494
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_madak_f32 v0, v0, v1, 0x3e22f983
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v2, 0.15915494
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v2, 0.15915494
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v2, 0.15915494
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v0, 0x3e22f983
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v2, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -865,78 +865,78 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0x3e230000
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0x3e230000
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0x3e230000
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_madak_f32 v0, v0, v1, 0x3e230000
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_madak_f32 v0, v0, v1, 0x3e230000
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v2, 0x3e230000
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v2, 0x3e230000
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v2, 0x3e230000
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x3e230000
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v2, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v0, 0x3e230000
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v2, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -949,78 +949,78 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0x367c0000
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0x367c0000
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0x367c0000
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_madak_f32 v0, v0, v1, 0x367c0000
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_madak_f32 v0, v0, v1, 0x367c0000
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v2, 0x367c0000
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v2, 0x367c0000
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v2, 0x367c0000
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v0, 0x367c0000
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v2, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v0, 0x367c0000
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v2, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1032,46 +1032,45 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 1.0
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 1.0
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 1.0
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_mad_f32 v0, v0, v3, 1.0
 ; SDAG-VI-NEXT:    v_mad_f32 v1, v2, v1, 1.0
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1082,56 +1081,57 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v2, 1.0
 ; SDAG-CI-NEXT:    v_mad_f32 v1, v1, v3, 1.0
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 1.0
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_mad_f32 v0, v2, v0, 1.0
 ; GISEL-VI-NEXT:    v_mad_f32 v1, v3, v1, 1.0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v2, 1.0
 ; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v3, 1.0
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1142,35 +1142,34 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0x3e230000
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0x3e230000
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0x3e230000
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -1178,11 +1177,11 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
 ; SDAG-VI-NEXT:    v_madak_f32 v0, v0, v3, 0x3e230000
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v1, v2, v4
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1194,39 +1193,39 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
 ; SDAG-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e230000
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 0x3e230000
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 0x3e230000
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 0x3e230000
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
@@ -1234,11 +1233,11 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
 ; GISEL-VI-NEXT:    v_madak_f32 v0, v2, v0, 0x3e230000
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v1, v3, v4
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1246,6 +1245,7 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
 ; GISEL-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e230000
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1257,46 +1257,45 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
 ; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0.15915494
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0.15915494
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0.15915494
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
 ; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_mad_f32 v0, v0, v3, 0.15915494
 ; SDAG-VI-NEXT:    v_mad_f32 v1, v2, v1, 0.15915494
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SDAG-CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -1308,50 +1307,50 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v1, 0x3e22f983
 ; SDAG-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e22f983
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-GFX1100:       ; %bb.0:
-; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 0.15915494
 ; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 0.15915494
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 0.15915494
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_mad_f32 v0, v2, v0, 0.15915494
 ; GISEL-VI-NEXT:    v_mad_f32 v1, v3, v1, 0.15915494
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -1359,6 +1358,7 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 ; GISEL-CI-NEXT:    v_mov_b32_e32 v1, 0x3e22f983
 ; GISEL-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e22f983
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1370,44 +1370,44 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v1, v3, v5 clamp
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v5
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, v2 clamp
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.hi = extractelement <2 x half> %src0, i32 1
   %src1.hi = extractelement <2 x half> %src1, i32 1
@@ -1424,32 +1424,32 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
 define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
 ; GFX1100-LABEL: no_mix_simple:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: no_mix_simple:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: no_mix_simple:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: no_mix_simple:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: no_mix_simple:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
   ret float %result
@@ -1458,32 +1458,32 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
 define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
 ; GFX1100-LABEL: no_mix_simple_fabs:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: no_mix_simple_fabs:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: no_mix_simple_fabs:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_f32 v0, |v0|, v1, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: no_mix_simple_fabs:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-LABEL: no_mix_simple_fabs:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.fabs = call float @llvm.fabs.f32(float %src0)
   %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2)
@@ -1497,48 +1497,48 @@ define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1550,45 +1550,45 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr
 define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1599,60 +1599,60 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl
 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX1100-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX906-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX906-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX906-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX906-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SDAG-CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1665,55 +1665,55 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
 define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX1100-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX1100-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX900-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX906-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX906-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX906-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SDAG-CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GISEL-CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1725,44 +1725,44 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1775,42 +1775,42 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal
 define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext half %src0 to float
   %src1.ext = fpext half %src1 to float
@@ -1822,54 +1822,54 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src
 define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-CI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e64 v3, -v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e64 v3, -v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %src0 = extractelement <2 x half> %src0.arg.bc, i32 0
@@ -1887,54 +1887,54 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1
 define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_mov_b32 s4, 0x8000
 ; GFX900-NEXT:    v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s4, 0x8000
 ; GFX906-NEXT:    v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX906-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; VI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_mad_f32 v0, |v0|, v1, v2
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
@@ -1950,47 +1950,47 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
 define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e64 v3, |v0|
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %src0 = extractelement <2 x half> %src0.arg.bc, i32 1
@@ -2005,58 +2005,58 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
 define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %fneg = fneg <2 x half> %src0.arg.bc
@@ -2071,58 +2071,58 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
 define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
@@ -2137,58 +2137,58 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
 define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
 ; GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_mad_f32 v0, -v0, v1, v2
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; SDAG-CI:       ; %bb.0:
-; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-VI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
-; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GISEL-CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; GISEL-CI-NEXT:    v_mac_f32_e32 v0, v3, v1
+; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.arg.bc = bitcast i32 %src0.arg to <2 x half>
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc)
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 1f72995b6eafdc7..278ba5f57ea0ecc 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -105,26 +105,26 @@ entry:
 define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX8-LABEL: v_mad_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_u16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mad_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mad_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_u16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg0, %arg1
   %add = add i16 %mul, %arg2
@@ -134,29 +134,29 @@ define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
 define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX8-LABEL: v_mad_u16_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_u16_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mad_u16_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mad_u16_zext:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg0, %arg1
   %add = add i16 %mul, %arg2
@@ -167,32 +167,32 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
 define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
 ; GFX8-LABEL: v_mad_u16_zext64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_u16_zext64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mad_u16_zext64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_mad_u16_zext64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_u16 v0, v0, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg0, %arg1
   %add = add i16 %mul, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 3a337ec7a8733d5..1d52a8f003e8c31 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -9,31 +9,31 @@
 define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_sextops:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_sextops:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -45,31 +45,31 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_sextops_commute:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_sextops_commute:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_commute:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -81,31 +81,31 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_u64_u32_zextops:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_zextops:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_u64_u32_zextops:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_u64_u32_zextops:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = zext i32 %arg0 to i64
   %sext1 = zext i32 %arg1 to i64
@@ -117,31 +117,31 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_u64_u32_zextops_commute:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_zextops_commute:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_u64_u32_zextops_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_u64_u32_zextops_commute:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = zext i32 %arg0 to i64
   %sext1 = zext i32 %arg1 to i64
@@ -153,7 +153,6 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_sextops_i32_i128:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
 ; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
 ; CI-NEXT:    v_mov_b32_e32 v8, 0
@@ -174,11 +173,11 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; CI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_sextops_i32_i128:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
 ; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
 ; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
@@ -207,11 +206,11 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
@@ -234,11 +233,11 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
@@ -266,6 +265,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i128
   %sext1 = sext i32 %arg1 to i128
@@ -277,31 +277,31 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
 define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_sextops_i32_i63:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_sextops_i32_i63:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_i32_i63:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i63
   %sext1 = sext i32 %arg1 to i63
@@ -313,15 +313,14 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
 define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_sextops_i31_i63:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_bfe_i32 v1, v1, 0, 31
 ; CI-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_sextops_i31_i63:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; SI-NEXT:    v_ashr_i64 v[4:5], v[3:4], 33
@@ -330,23 +329,24 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
 ; SI-NEXT:    v_mul_hi_i32 v4, v4, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v4, v3, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 31
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_sextops_i31_i63:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_bfe_i32 v4, v1, 0, 31
 ; GFX11-NEXT:    v_bfe_i32 v5, v0, 0, 31
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i31 %arg0 to i63
   %sext1 = sext i31 %arg1 to i63
@@ -358,16 +358,15 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
 define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_extops_i32_i64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CI-NEXT:    v_mul_lo_u32 v4, v4, v1
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_extops_i32_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; SI-NEXT:    v_mul_hi_u32 v5, v0, v1
 ; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
@@ -375,22 +374,22 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v5, v4
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_extops_i32_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
@@ -398,6 +397,7 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext0 = sext i32 %arg0 to i64
   %ext1 = zext i32 %arg1 to i64
@@ -409,31 +409,31 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_u64_u32_bitops:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_bitops:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
 ; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_u64_u32_bitops:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_u64_u32_bitops:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc.lhs = and i64 %arg0, 4294967295
   %trunc.rhs = and i64 %arg1, 4294967295
@@ -445,16 +445,15 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_and_b32_e32 v3, 1, v1
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
 ; CI-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v1
 ; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
@@ -462,21 +461,21 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_u64_u32_bitops_lhs_mask_small:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -484,6 +483,7 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc.lhs = and i64 %arg0, 8589934591
   %trunc.rhs = and i64 %arg1, 4294967295
@@ -495,17 +495,16 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v6, v0
 ; CI-NEXT:    v_and_b32_e32 v3, 1, v3
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
 ; CI-NEXT:    v_mul_lo_u32 v2, v6, v3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v3
 ; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; SI-NEXT:    v_mul_lo_u32 v1, v0, v1
@@ -513,28 +512,29 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_u64_u32_bitops_rhs_mask_small:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
 ; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %trunc.lhs = and i64 %arg0, 4294967295
   %trunc.rhs = and i64 %arg1, 8589934591
@@ -546,31 +546,31 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #
 define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_bitops:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_bitops:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
 ; SI-NEXT:    v_mul_hi_i32 v2, v0, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_bitops:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_bitops:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl.lhs = shl i64 %arg0, 32
   %trunc.lhs = ashr i64 %shl.lhs, 32
@@ -585,31 +585,31 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
 define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
 ; CI-LABEL: mad_i64_i32_unpack_i64ops:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_unpack_i64ops:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v2, v1, v0
 ; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_unpack_i64ops:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp4 = lshr i64 %arg0, 32
   %tmp5 = and i64 %arg0, 4294967295
@@ -695,16 +695,15 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
 ; CI-LABEL: mad_i64_i32_twice:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
 ; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_twice:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v6, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
@@ -713,25 +712,26 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
 ; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v5, vcc
 ; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, v2, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_twice:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_twice:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
 ; GFX11-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v6, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v7, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -745,7 +745,6 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
 define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
 ; CI-LABEL: mad_i64_i32_thrice:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
@@ -757,11 +756,11 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 ; CI-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_thrice:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v8, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v2
@@ -774,11 +773,11 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 ; SI-NEXT:    v_xor_b32_e32 v3, v1, v3
 ; SI-NEXT:    v_xor_b32_e32 v1, v2, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, v3, v5
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_thrice:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
 ; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
@@ -786,11 +785,11 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 ; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_thrice:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_i64_i32 v[8:9], null, v0, v1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v2
@@ -805,6 +804,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -820,37 +820,36 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %
 define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; CI-LABEL: mad_i64_i32_secondary_use:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i64_i32_secondary_use:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
 ; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
 ; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; SI-NEXT:    v_xor_b32_e32 v0, v2, v4
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i64_i32_secondary_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
 ; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i64_i32_secondary_use:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mad_i64_i32 v[4:5], null, v0, v1, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
@@ -858,6 +857,7 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -870,7 +870,6 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
 ; CI-LABEL: mad_i48_i48:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v6, v1
 ; CI-NEXT:    v_mov_b32_e32 v7, v0
 ; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
@@ -878,11 +877,11 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
 ; CI-NEXT:    v_mul_lo_u32 v3, v7, v3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: mad_i48_i48:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; SI-NEXT:    v_mul_hi_u32 v6, v0, v2
 ; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
@@ -891,22 +890,22 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mad_i48_i48:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v7, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v6, v2
 ; GFX9-NEXT:    v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mad_i48_i48:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
@@ -914,6 +913,7 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
 ; GFX11-NEXT:    v_mul_lo_u32 v2, v6, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %m = mul i48 %arg0, %arg1
   %a = add i48 %m, %arg2
diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
index e64bc0dc374da67..1a65dadd108e93c 100644
--- a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll
@@ -13,23 +13,21 @@ declare i32 @llvm.umax.i32(i32, i32)
 define i32 @v_known_bits_umed3(i8 %a) {
 ; SI-SDAG-LABEL: v_known_bits_umed3:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v1, 0x80
 ; SI-SDAG-NEXT:    v_med3_u32 v0, v0, 32, v1
 ; SI-SDAG-NEXT:    s_mov_b32 m0, -1
 ; SI-SDAG-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_known_bits_umed3:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x80
 ; SI-GISEL-NEXT:    v_med3_u32 v0, v0, 32, v1
 ; SI-GISEL-NEXT:    ds_read_u8 v0, v0 offset:128
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ext.a = zext i8 %a to i32
   %max.a = call i32 @llvm.umax.i32(i32 %ext.a, i32 32)
@@ -48,7 +46,6 @@ define i32 @v_known_bits_umed3(i8 %a) {
 define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
 ; SI-SDAG-LABEL: v_known_signbits_smed3:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; SI-SDAG-NEXT:    s_movk_i32 s4, 0xffc0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0x80
@@ -70,11 +67,11 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
 ; SI-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; SI-SDAG-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; SI-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_known_signbits_smed3:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffffffc0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x80
@@ -110,6 +107,7 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) {
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v1, v4, v2
 ; SI-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ext.a = sext i16 %a to i32
   %max.a = call i32 @llvm.smax.i32(i32 %ext.a, i32 -32)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 615039500962da9..351ea7e18cbc3c9 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -6,7 +6,6 @@
 define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-LABEL: issue63986:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_lshlrev_b64 v[4:5], 6, v[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
@@ -115,6 +114,7 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ; Child Loop BB0_14 Depth 2
 ; CHECK-NEXT:    ; Child Loop BB0_18 Depth 2
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_15
 ; CHECK-NEXT:  ; %bb.13: ; %loop-memcpy-expansion2.preheader
@@ -222,8 +222,8 @@ while.cond:                         ; preds = %while.cond
 define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-LABEL: issue63986_reduced_expanded:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:  ; %bb.1: ; %loop-memcpy-expansion.preheader
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  ; %bb.2: ; %loop-memcpy-residual-header
 ; CHECK-NEXT:    s_and_b32 s4, 32, 15
@@ -271,7 +271,7 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_9
 ; CHECK-NEXT:  ; %bb.10: ; %DummyReturnBlock
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index a5e0ceaa6b329b2..3a3910ccc4f4085 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -6,7 +6,6 @@
 define void @memcpy_fixed_align(ptr addrspace(5)  %dst, ptr addrspace(1) %src) {
 ; MUBUF-LABEL: memcpy_fixed_align:
 ; MUBUF:       ; %bb.0:
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
 ; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
@@ -27,12 +26,11 @@ define void @memcpy_fixed_align(ptr addrspace(5)  %dst, ptr addrspace(1) %src) {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use v0
 ; MUBUF-NEXT:    ;;#ASMEND
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: memcpy_fixed_align:
 ; FLATSCR:       ; %bb.0:
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; FLATSCR-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
 ; FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
@@ -46,7 +44,7 @@ define void @memcpy_fixed_align(ptr addrspace(5)  %dst, ptr addrspace(1) %src) {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use v0
 ; FLATSCR-NEXT:    ;;#ASMEND
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [40 x i8], addrspace(5)
   call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 dereferenceable(40) %alloca, ptr addrspace(1) align 4 dereferenceable(40) %src, i64 40, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 6f419ab2cc67eba..969625b3c8daa2d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -145,7 +145,6 @@ bb:
 define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr addrspace(5) noalias nocapture %arg1) {
 ; GCN-LABEL: mubuf_clause:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x3ff0, v2
 ; GCN-NEXT:    v_add_u32_e32 v0, v0, v2
@@ -199,12 +198,11 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
 ; GCN-NEXT:    buffer_store_dword v17, v1, s[0:3], 0 offen offset:52
 ; GCN-NEXT:    s_waitcnt vmcnt(15)
 ; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:48
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-SCRATCH-LABEL: mubuf_clause:
 ; GCN-SCRATCH:       ; %bb.0: ; %bb
-; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
 ; GCN-SCRATCH-NEXT:    v_and_b32_e32 v18, 0x3ff0, v2
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v0, v18
@@ -228,6 +226,7 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
 ; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v18, v[10:13], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v19, v[14:17], off
+; GCN-SCRATCH-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -305,7 +304,6 @@ bb:
 define void @load_global_d16_hi(ptr addrspace(1) %in, i16 %reg, ptr addrspace(1) %out) {
 ; GCN-LABEL: load_global_d16_hi:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-NEXT:    global_load_short_d16_hi v5, v[0:1], off
 ; GCN-NEXT:    s_nop 0
@@ -314,12 +312,11 @@ define void @load_global_d16_hi(ptr addrspace(1) %in, i16 %reg, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    global_store_dword v[3:4], v2, off offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-SCRATCH-LABEL: load_global_d16_hi:
 ; GCN-SCRATCH:       ; %bb.0: ; %entry
-; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
 ; GCN-SCRATCH-NEXT:    global_load_short_d16_hi v5, v[0:1], off
@@ -328,6 +325,7 @@ define void @load_global_d16_hi(ptr addrspace(1) %in, i16 %reg, ptr addrspace(1)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
+; GCN-SCRATCH-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
@@ -346,7 +344,6 @@ entry:
 define void @load_global_d16_lo(ptr addrspace(1) %in, i32 %reg, ptr addrspace(1) %out) {
 ; GCN-LABEL: load_global_d16_lo:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-NEXT:    global_load_short_d16 v5, v[0:1], off
 ; GCN-NEXT:    s_nop 0
@@ -355,12 +352,11 @@ define void @load_global_d16_lo(ptr addrspace(1) %in, i32 %reg, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    global_store_dword v[3:4], v2, off offset:128
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-SCRATCH-LABEL: load_global_d16_lo:
 ; GCN-SCRATCH:       ; %bb.0: ; %entry
-; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v5, v2
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
 ; GCN-SCRATCH-NEXT:    global_load_short_d16 v5, v[0:1], off
@@ -369,6 +365,7 @@ define void @load_global_d16_lo(ptr addrspace(1) %in, i32 %reg, ptr addrspace(1)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v5, off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    global_store_dword v[3:4], v2, off offset:128
+; GCN-SCRATCH-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 32
diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 32b61aa1145db4b..263507b6dd2e8da 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -5,8 +5,8 @@
 define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
@@ -46,8 +46,8 @@ define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %
 define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_commuted_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %c, i32 %smax)
@@ -57,8 +57,8 @@ define i32 @test_minmax_commuted_i32(i32 %a, i32 %b, i32 %c) {
 define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_maxmin_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %smin, i32 %c)
@@ -68,8 +68,8 @@ define i32 @test_maxmin_i32(i32 %a, i32 %b, i32 %c) {
 define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_maxmin_commuted_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_i32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b)
   %smaxmin = call i32 @llvm.smax.i32(i32 %c, i32 %smin)
@@ -79,9 +79,9 @@ define i32 @test_maxmin_commuted_i32(i32 %a, i32 %b, i32 %c) {
 define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-LABEL: test_smed3_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_med3_i32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.smin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.smax.i32(i32 %x, i32 %y)
@@ -94,8 +94,8 @@ define void @test_smed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_u32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %umax, i32 %c)
@@ -135,8 +135,8 @@ define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %
 define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_minmax_commuted_u32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_maxmin_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %uminmax = call i32 @llvm.umin.i32(i32 %c, i32 %umax)
@@ -146,8 +146,8 @@ define i32 @test_minmax_commuted_u32(i32 %a, i32 %b, i32 %c) {
 define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_maxmin_u32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %umin, i32 %c)
@@ -157,8 +157,8 @@ define i32 @test_maxmin_u32(i32 %a, i32 %b, i32 %c) {
 define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
 ; GFX11-LABEL: test_maxmin_commuted_u32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_minmax_u32 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b)
   %umaxmin = call i32 @llvm.umax.i32(i32 %c, i32 %umin)
@@ -168,9 +168,9 @@ define i32 @test_maxmin_commuted_u32(i32 %a, i32 %b, i32 %c) {
 define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 ; GFX11-LABEL: test_umed3_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_med3_u32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call i32 @llvm.umin.i32(i32 %x, i32 %y)
   %tmp1 = call i32 @llvm.umax.i32(i32 %x, i32 %y)
@@ -183,18 +183,18 @@ define void @test_umed3_i32(ptr addrspace(1) %arg, i32 %x, i32 %y, i32 %z) {
 define float @test_minmax_f32_ieee_true(float %a, float %b, float %c) {
 ; SDAG-LABEL: test_minmax_f32_ieee_true:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_minmax_f32_ieee_true:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maxnum.f32(float %a, float %b)
   %minmax = call float @llvm.minnum.f32(float %max, float %c)
@@ -242,18 +242,18 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
 define float @test_maxmin_f32_ieee_true(float %a, float %b, float %c) {
 ; SDAG-LABEL: test_maxmin_f32_ieee_true:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
 ; SDAG-NEXT:    v_max_f32_e32 v2, v2, v2
 ; SDAG-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_maxmin_f32_ieee_true:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
 ; GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GISEL-NEXT:    v_minmax_f32 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minnum.f32(float %a, float %b)
   %maxmin = call float @llvm.maxnum.f32(float %min, float %c)
@@ -273,9 +273,9 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
 define void @test_med3_f32(ptr addrspace(1) %arg, float %x, float %y, float %z) #0 {
 ; GFX11-LABEL: test_med3_f32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_med3_f32 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call float @llvm.minnum.f32(float %x, float %y)
   %tmp1 = call float @llvm.maxnum.f32(float %x, float %y)
@@ -326,20 +326,20 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 define half @test_minmax_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-LABEL: test_minmax_commuted_f16_ieee_true:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
 ; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
 ; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
 ; SDAG-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_minmax_commuted_f16_ieee_true:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GISEL-NEXT:    v_maxmin_f16 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %max = call half @llvm.maxnum.f16(half %a, half %b)
   %minmax = call half @llvm.minnum.f16(half %c, half %max)
@@ -359,20 +359,20 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
 ; SDAG-LABEL: test_maxmin_commuted_f16_ieee_true:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
 ; SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
 ; SDAG-NEXT:    v_max_f16_e32 v2, v2, v2
 ; SDAG-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_maxmin_commuted_f16_ieee_true:
 ; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
 ; GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
 ; GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
 ; GISEL-NEXT:    v_minmax_f16 v0, v0, v1, v2
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %min = call half @llvm.minnum.f16(half %a, half %b)
   %maxmin = call half @llvm.maxnum.f16(half %c, half %min)
@@ -382,9 +382,9 @@ define half @test_maxmin_commuted_f16_ieee_true(half %a, half %b, half %c) {
 define void @test_med3_f16(ptr addrspace(1) %arg, half %x, half %y, half %z) #0 {
 ; GFX11-LABEL: test_med3_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_med3_f16 v2, v2, v3, v4
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp0 = call half @llvm.minnum.f16(half %x, half %y)
   %tmp1 = call half @llvm.maxnum.f16(half %x, half %y)
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index e557e0ce9b1be5e..ae419bf2e3d62a8 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -139,46 +139,42 @@ define amdgpu_kernel void @nocall_false_sharing() {
 define void @nonkernel() {
 ; GFX9-LABEL: nonkernel:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
 ; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: nonkernel:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
 ; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; G_GFX9-LABEL: nonkernel:
 ; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX9-NEXT:    ds_write_b32 v3, v2
 ; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; G_GFX10-LABEL: nonkernel:
 ; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
 ; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; G_GFX10-NEXT:    ds_write_b32 v3, v2
 ; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
   store i32 0, ptr addrspace(3) @used_by_both
   store double 0.0, ptr addrspace(3) @used_by_function
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
index fd261e8e2bb25e3..985f50902a548d9 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll
@@ -12,7 +12,6 @@
 define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr:
 ; GFX9_W64:       ; %bb.0:
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -22,8 +21,8 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr4
@@ -33,11 +32,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX9_W64-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr:
 ; GFX1010_W32:       ; %bb.0:
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1010_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
@@ -47,6 +46,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
 ; GFX1010_W32-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -58,11 +58,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr:
 ; GFX1010_W64:       ; %bb.0:
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1010_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -72,6 +72,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -83,11 +84,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1010_W64-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr:
 ; GFX1100_W32:       ; %bb.0:
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1100_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
@@ -97,8 +98,9 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -109,11 +111,11 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr:
 ; GFX1100_W64:       ; %bb.0:
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1100_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
@@ -123,8 +125,9 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -135,11 +138,12 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr:
 ; W64-O0:       ; %bb.0:
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
@@ -245,7 +249,7 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
   %call = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1
   ret float %call
@@ -258,7 +262,6 @@ define float @mubuf_vgpr(<4 x i32> %i, i32 %c) #0 {
 define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX9_W64:       ; %bb.0: ; %entry
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -268,8 +271,8 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -298,12 +301,11 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX9_W64-NEXT:    global_store_dword v[9:10], v13, off
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    global_store_dword v[11:12], v0, off
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1010_W32:       ; %bb.0: ; %entry
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1010_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
@@ -313,6 +315,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
 ; GFX1010_W32-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -345,11 +348,11 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    global_store_dword v[11:12], v0, off
 ; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1010_W64:       ; %bb.0: ; %entry
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1010_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -359,6 +362,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -391,11 +395,11 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    global_store_dword v[11:12], v0, off
 ; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1100_W32:       ; %bb.0: ; %entry
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1100_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
@@ -405,8 +409,9 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -440,11 +445,11 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v0, off dlc
 ; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1100_W64:       ; %bb.0: ; %entry
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1100_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
@@ -454,8 +459,9 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -489,11 +495,12 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v0, off dlc
 ; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr_adjacent_in_block:
 ; W64-O0:       ; %bb.0: ; %entry
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
@@ -713,7 +720,7 @@ define void @mubuf_vgpr_adjacent_in_block(<4 x i32> %i, <4 x i32> %j, i32 %c, pt
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %i, i32 %c, i32 0, i32 0, i32 0) #1
@@ -729,7 +736,6 @@ entry:
 define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX9_W64:       ; %bb.0: ; %entry
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    ;;#ASMSTART
 ; GFX9_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX9_W64-NEXT:    ;;#ASMEND
@@ -743,8 +749,8 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr8
@@ -780,12 +786,11 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX9_W64-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    global_store_dword v[11:12], v9, off
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1010_W32:       ; %bb.0: ; %entry
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    ;;#ASMSTART
 ; GFX1010_W32-NEXT:    s_mov_b32 s4, 17
 ; GFX1010_W32-NEXT:    ;;#ASMEND
@@ -799,6 +804,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s5, vcc_lo, s5
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s5, s5
 ; GFX1010_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -837,11 +843,11 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    global_store_dword v[11:12], v9, off
 ; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1010_W64:       ; %bb.0: ; %entry
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    ;;#ASMSTART
 ; GFX1010_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX1010_W64-NEXT:    ;;#ASMEND
@@ -855,6 +861,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -893,11 +900,11 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    global_store_dword v[11:12], v9, off
 ; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1100_W32:       ; %bb.0: ; %entry
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    ;;#ASMSTART
 ; GFX1100_W32-NEXT:    s_mov_b32 s4, 17
 ; GFX1100_W32-NEXT:    ;;#ASMEND
@@ -911,8 +918,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -953,11 +961,11 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v9, off dlc
 ; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1100_W64:       ; %bb.0: ; %entry
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    ;;#ASMSTART
 ; GFX1100_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX1100_W64-NEXT:    ;;#ASMEND
@@ -971,8 +979,9 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -1013,11 +1022,12 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v9, off dlc
 ; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr_outside_entry:
 ; W64-O0:       ; %bb.0: ; %entry
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
@@ -1267,7 +1277,7 @@ define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, ptr ad
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 72fbcc7bd9e40f5..d0ac52ed80497bb 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -11,7 +11,6 @@
 define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr:
 ; GFX9_W64:       ; %bb.0:
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -21,8 +20,8 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr4
@@ -32,11 +31,11 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX9_W64-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr:
 ; GFX1010_W32:       ; %bb.0:
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1010_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
@@ -46,6 +45,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
 ; GFX1010_W32-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -57,11 +57,11 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1010_W32-NEXT:    s_mov_b32 exec_lo, s5
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr:
 ; GFX1010_W64:       ; %bb.0:
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1010_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -71,6 +71,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v5, v4, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -82,11 +83,11 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1010_W64-NEXT:    s_mov_b64 exec, s[6:7]
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr:
 ; GFX1100_W32:       ; %bb.0:
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1100_W32-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
@@ -96,8 +97,9 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -108,11 +110,11 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1100_W32-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr:
 ; GFX1100_W64:       ; %bb.0:
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1100_W64-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
@@ -122,8 +124,9 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v5, v4, s[4:7], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -134,11 +137,12 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; GFX1100_W64-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    v_mov_b32_e32 v0, v5
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr:
 ; W64-O0:       ; %bb.0:
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
@@ -259,7 +263,7 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 ; W64-O0-NEXT:    s_nop 0
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
   %call = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1
   ret float %call
@@ -272,7 +276,6 @@ define float @mubuf_vgpr(ptr addrspace(8) %i, i32 %c) #0 {
 define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %out0, ptr addrspace(1) %out1) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX9_W64:       ; %bb.0: ; %entry
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -282,8 +285,8 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    s_xor_b64 exec, exec, s[4:5]
@@ -312,12 +315,11 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX9_W64-NEXT:    global_store_dword v[9:10], v13, off
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    global_store_dword v[11:12], v0, off
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1010_W32:       ; %bb.0: ; %entry
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1010_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W32-NEXT:    v_readfirstlane_b32 s8, v0
@@ -327,6 +329,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s4, s4
 ; GFX1010_W32-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -359,11 +362,11 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    global_store_dword v[11:12], v0, off
 ; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1010_W64:       ; %bb.0: ; %entry
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1010_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1010_W64-NEXT:    v_readfirstlane_b32 s8, v0
@@ -373,6 +376,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v13, v8, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -405,11 +409,11 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    global_store_dword v[11:12], v0, off
 ; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1100_W32:       ; %bb.0: ; %entry
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1100_W32-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W32-NEXT:    v_readfirstlane_b32 s4, v0
@@ -419,8 +423,9 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -454,11 +459,11 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v0, off dlc
 ; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr_adjacent_in_block:
 ; GFX1100_W64:       ; %bb.0: ; %entry
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1100_W64-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1100_W64-NEXT:    v_readfirstlane_b32 s4, v0
@@ -468,8 +473,9 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v13, v8, s[4:7], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -503,11 +509,12 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v0, off dlc
 ; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr_adjacent_in_block:
 ; W64-O0:       ; %bb.0: ; %entry
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -752,7 +759,7 @@ define void @mubuf_vgpr_adjacent_in_block(ptr addrspace(8) %i, ptr addrspace(8)
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %i, i32 %c, i32 0, i32 0, i32 0) #1
@@ -768,7 +775,6 @@ entry:
 define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j, i32 %c, ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
 ; GFX9_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX9_W64:       ; %bb.0: ; %entry
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    ;;#ASMSTART
 ; GFX9_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX9_W64-NEXT:    ;;#ASMEND
@@ -782,8 +788,8 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX9_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
 ; GFX9_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX9_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX9_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
-; GFX9_W64-NEXT:    s_nop 0
 ; GFX9_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9_W64-NEXT:    ; implicit-def: $vgpr8
@@ -819,12 +825,11 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX9_W64-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9_W64-NEXT:    global_store_dword v[11:12], v9, off
-; GFX9_W64-NEXT:    s_waitcnt vmcnt(0)
+; GFX9_W64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W32-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1010_W32:       ; %bb.0: ; %entry
-; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W32-NEXT:    ;;#ASMSTART
 ; GFX1010_W32-NEXT:    s_mov_b32 s4, 17
 ; GFX1010_W32-NEXT:    ;;#ASMEND
@@ -838,6 +843,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1010_W32-NEXT:    v_cmp_eq_u64_e64 s5, s[10:11], v[2:3]
 ; GFX1010_W32-NEXT:    s_and_b32 s5, vcc_lo, s5
+; GFX1010_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W32-NEXT:    s_and_saveexec_b32 s5, s5
 ; GFX1010_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1010_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -876,11 +882,11 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1010_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W32-NEXT:    global_store_dword v[11:12], v9, off
 ; GFX1010_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1010_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1010_W64:       ; %bb.0: ; %entry
-; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1010_W64-NEXT:    ;;#ASMSTART
 ; GFX1010_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX1010_W64-NEXT:    ;;#ASMEND
@@ -894,6 +900,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1010_W64-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[10:11], v[2:3]
 ; GFX1010_W64-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; GFX1010_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1010_W64-NEXT:    s_and_saveexec_b64 s[6:7], s[6:7]
 ; GFX1010_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1010_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -932,11 +939,11 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1010_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010_W64-NEXT:    global_store_dword v[11:12], v9, off
 ; GFX1010_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1010_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W32-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1100_W32:       ; %bb.0: ; %entry
-; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W32-NEXT:    ;;#ASMSTART
 ; GFX1100_W32-NEXT:    s_mov_b32 s4, 17
 ; GFX1100_W32-NEXT:    ;;#ASMEND
@@ -950,8 +957,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
 ; GFX1100_W32-NEXT:    v_cmp_eq_u64_e64 s0, s[10:11], v[2:3]
-; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W32-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX1100_W32-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W32-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX1100_W32-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1100_W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -992,11 +1000,11 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1100_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W32-NEXT:    global_store_b32 v[11:12], v9, off dlc
 ; GFX1100_W32-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100_W64-LABEL: mubuf_vgpr_outside_entry:
 ; GFX1100_W64:       ; %bb.0: ; %entry
-; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100_W64-NEXT:    ;;#ASMSTART
 ; GFX1100_W64-NEXT:    s_mov_b32 s4, 17
 ; GFX1100_W64-NEXT:    ;;#ASMEND
@@ -1010,8 +1018,9 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GFX1100_W64-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3]
-; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1100_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100_W64-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX1100_W64-NEXT:    s_waitcnt expcnt(0)
 ; GFX1100_W64-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX1100_W64-NEXT:    buffer_load_format_x v9, v8, s[8:11], 0 idxen
 ; GFX1100_W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
@@ -1052,11 +1061,12 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; GFX1100_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1100_W64-NEXT:    global_store_b32 v[11:12], v9, off dlc
 ; GFX1100_W64-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1100_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1100_W64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; W64-O0-LABEL: mubuf_vgpr_outside_entry:
 ; W64-O0:       ; %bb.0: ; %entry
-; W64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; W64-O0-NEXT:    s_waitcnt expcnt(0)
 ; W64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; W64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; W64-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
@@ -1344,7 +1354,7 @@ define void @mubuf_vgpr_outside_entry(ptr addrspace(8) %i, ptr addrspace(8) %j,
 ; W64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; W64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; W64-O0-NEXT:    s_waitcnt vmcnt(0)
+; W64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; W64-O0-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 179b0a855182ce2..5738708ee15f06e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -2647,32 +2647,32 @@ entry:
 define i32 @mul_pow2_plus_1(i32 %val) {
 ; SI-LABEL: mul_pow2_plus_1:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_pow2_plus_1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_pow2_plus_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: mul_pow2_plus_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_pow2_plus_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: mul_pow2_plus_1:
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
index 797b13044e7226e..394308b0e730e1b 100644
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -9,7 +9,6 @@
 define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 %arg16) #0 {
 ; GFX9-LABEL: lsr_order_mul24_0:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v1
@@ -24,6 +23,7 @@ define void @lsr_order_mul24_0(i32 %arg, i32 %arg2, i32 %arg6, i32 %arg13, i32 %
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v0
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
 ; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX9-NEXT:  ; %bb.2: ; %.loopexit
@@ -53,10 +53,10 @@ bb23:                                             ; preds = %bb23, %bb
 define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) nocapture %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, ptr addrspace(1) nocapture readonly %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, i32 %arg16, i1 zeroext %arg17, i1 zeroext %arg18) #0 {
 ; GFX9-LABEL: lsr_order_mul24_1:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v5, 1, v18
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cmp_lt_u32_e64 s[4:5], v0, v1
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:  ; %bb.1: ; %bb19
@@ -102,7 +102,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3)
 ; GFX9-NEXT:    s_cbranch_execnz .LBB1_2
 ; GFX9-NEXT:  .LBB1_3: ; %Flow2
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = icmp ult i32 %arg, %arg1
@@ -150,7 +150,6 @@ bb23:                                             ; preds = %bb19, %bb23
 define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-LABEL: slsr1_0:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v0, v1
 ; GFX9-NEXT:    global_store_dword v[0:1], v3, off
@@ -160,7 +159,7 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %b = and i32 %b.arg, 16777215
   %s = and i32 %s.arg, 16777215
@@ -187,9 +186,9 @@ define void @slsr1_0(i32 %b.arg, i32 %s.arg) #0 {
 define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
 ; GFX9-LABEL: slsr1_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index c3529debe693d82..bff78b854d8ceaa 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -186,26 +186,26 @@ entry:
 define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
 ; SI-LABEL: test_smul48_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; SI-NEXT:    v_mov_b32_e32 v0, v3
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_smul48_i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_smul48_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: test_smul48_i64:
@@ -228,7 +228,6 @@ define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
 define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; SI-LABEL: test_smul48_v2i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
@@ -241,11 +240,11 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v2
 ; SI-NEXT:    v_mul_i32_i24_e32 v2, v5, v6
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v5, v6
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_smul48_v2i64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
@@ -258,11 +257,11 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
 ; VI-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_smul48_v2i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; GFX9-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
@@ -275,6 +274,7 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: test_smul48_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index ffc533decc04287..7382ab17593c8dc 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -554,10 +554,10 @@ entry:
 define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: test_umul48_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_u32_u24_e32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and i64 %lhs, 16777215
   %rhs24 = and i64 %rhs, 16777215
@@ -568,13 +568,13 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GCN-LABEL: test_umul48_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v4
 ; GCN-NEXT:    v_mul_u32_u24_e32 v4, v2, v6
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v6
 ; GCN-NEXT:    v_mov_b32_e32 v0, v5
 ; GCN-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
   %rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>
@@ -809,12 +809,12 @@ entry:
 define i17 @test_umul24_anyextend_i24_src0_src1(i24 %a, i24 %b) {
 ; GCN-LABEL: test_umul24_anyextend_i24_src0_src1:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0xea, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v1, 0x39b, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffe, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x63, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %aa = mul i24 %a, 234
@@ -830,7 +830,6 @@ entry:
 define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
 ; GCN-LABEL: test_umul24_anyextend_i23_src0_src1:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffff, v1
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0xea, v0
@@ -840,6 +839,7 @@ define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x1fffe, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x63, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %aa = mul i23 %a, 234
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index f4114a01e9b4863..864deb7632dc0bc 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -5,7 +5,6 @@
 define hidden fastcc void @callee_has_fp() #1 {
 ; CHECK-LABEL: callee_has_fp:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0x200
@@ -14,6 +13,7 @@ define hidden fastcc void @callee_has_fp() #1 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_add_i32 s32, s32, 0xfffffe00
 ; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
   store volatile i32 1, ptr addrspace(5) %alloca
@@ -26,9 +26,9 @@ define hidden fastcc void @callee_has_fp() #1 {
 define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
 ; CHECK-LABEL: csr_vgpr_spill_fp_callee:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s18, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
@@ -107,7 +107,7 @@ bb:
 define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 {
 ; CHECK-LABEL: csr_vgpr_spill_fp_tailcall_callee:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
@@ -174,11 +174,11 @@ bb:
 define hidden i32 @tail_call() #1 {
 ; CHECK-LABEL: tail_call:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    s_mov_b32 s33, s4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret i32 0
@@ -187,9 +187,9 @@ entry:
 define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
 ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s18, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
@@ -221,9 +221,9 @@ entry:
 define hidden i32 @caller_save_vgpr_spill_fp() #0 {
 ; CHECK-LABEL: caller_save_vgpr_spill_fp:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s19, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[16:17]
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 9999cb9173b5db0..5d2751589718010 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -12,9 +12,9 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
 ; CHECK-NEXT:    .cfi_sections .debug_frame
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s16, s33
 ; CHECK-NEXT:    s_mov_b32 s33, s32
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index dcaf7664d5b5aa0..02c73618843a656 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -12,25 +12,22 @@
 define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load i8, ptr %gep, align 4
@@ -40,25 +37,22 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load i8, ptr %gep, align 4
@@ -68,25 +62,22 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
@@ -96,47 +87,42 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load i8, ptr %gep, align 4
@@ -146,29 +132,26 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -2048
   %load = load i8, ptr %gep, align 4
@@ -178,29 +161,26 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
@@ -210,29 +190,26 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
@@ -242,25 +219,22 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
@@ -270,47 +244,42 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load i8, ptr %gep, align 4
@@ -320,47 +289,42 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 16383
   %load = load i8, ptr %gep, align 4
@@ -370,29 +334,26 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
@@ -402,29 +363,26 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
@@ -434,29 +392,26 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load i8, ptr %gep, align 4
@@ -467,34 +422,30 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -502,12 +453,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -515,12 +465,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -529,7 +478,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936639
   %load = load i8, ptr %gep, align 4
@@ -540,34 +489,30 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -575,12 +520,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -588,12 +532,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -602,7 +545,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589936640
   %load = load i8, ptr %gep, align 4
@@ -613,34 +556,30 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -648,12 +587,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -661,12 +599,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -675,7 +612,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938687
   %load = load i8, ptr %gep, align 4
@@ -686,34 +623,30 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -721,12 +654,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -734,12 +666,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -748,7 +679,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589938688
   %load = load i8, ptr %gep, align 4
@@ -759,34 +690,30 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -794,12 +721,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -807,12 +733,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -821,7 +746,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942783
   %load = load i8, ptr %gep, align 4
@@ -832,34 +757,30 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -867,12 +788,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -880,12 +800,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -894,7 +813,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8589942784
   %load = load i8, ptr %gep, align 4
@@ -905,35 +824,31 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -941,12 +856,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -954,12 +868,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -968,7 +881,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773761
   %load = load i8, ptr %gep, align 4
@@ -979,35 +892,31 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1015,12 +924,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1028,12 +936,11 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1042,7 +949,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854773760
   %load = load i8, ptr %gep, align 4
@@ -1053,35 +960,31 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1089,12 +992,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1102,12 +1004,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1116,7 +1017,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771713
   %load = load i8, ptr %gep, align 4
@@ -1127,35 +1028,31 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1163,12 +1060,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1176,12 +1072,11 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1190,7 +1085,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854771712
   %load = load i8, ptr %gep, align 4
@@ -1201,35 +1096,31 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1237,12 +1128,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1250,12 +1140,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1264,7 +1153,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767617
   %load = load i8, ptr %gep, align 4
@@ -1275,35 +1164,31 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1311,12 +1196,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1324,12 +1208,11 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    flat_load_ubyte v0, v[0:1]
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1338,7 +1221,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -9223372036854767616
   %load = load i8, ptr %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 137c83a0fd80c96..a7d318d039f714f 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -12,23 +12,20 @@
 define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -38,23 +35,20 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -64,34 +58,30 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -101,56 +91,50 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -160,23 +144,20 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_neg_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -186,25 +167,22 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -214,29 +192,26 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -246,34 +221,30 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -283,56 +254,50 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -342,56 +307,50 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3fff, v0
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3fff, v0
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -401,25 +360,22 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -429,29 +385,26 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -461,29 +414,26 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
 ; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
 ; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -494,7 +444,6 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -502,12 +451,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -515,12 +463,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -529,34 +476,31 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -567,7 +511,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -575,12 +518,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -588,12 +530,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -602,34 +543,31 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -640,7 +578,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -648,12 +585,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -661,12 +597,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -675,34 +610,31 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -713,7 +645,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -721,12 +652,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -734,12 +664,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -748,34 +677,31 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -786,7 +712,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -794,12 +719,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -807,12 +731,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -821,34 +744,31 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -859,7 +779,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX9-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -867,12 +786,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX10-GISEL-NEXT:    s_mov_b32 s5, 2
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -880,12 +798,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX11-GISEL-NEXT:    s_mov_b32 s1, 2
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -894,34 +811,31 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -932,7 +846,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -940,12 +853,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x7ff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -953,12 +865,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x7ff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -967,35 +878,32 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1006,7 +914,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1014,12 +921,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x800
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1027,12 +933,11 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x800
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1041,35 +946,32 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1080,7 +982,6 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1088,12 +989,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0xfff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1101,12 +1001,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1115,35 +1014,32 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1154,7 +1050,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1162,12 +1057,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1175,12 +1069,11 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1000
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1189,35 +1082,32 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1228,7 +1118,6 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1236,12 +1125,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x1fff
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1249,12 +1137,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x1fff
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1263,35 +1150,32 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load i8, ptr addrspace(1) %gep, align 4
@@ -1302,7 +1186,6 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
 ; GFX9-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX9-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1310,12 +1193,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x2000
 ; GFX10-GISEL-NEXT:    s_brev_b32 s5, 1
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s4
@@ -1323,12 +1205,11 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
 ; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x2000
 ; GFX11-GISEL-NEXT:    s_brev_b32 s1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1337,35 +1218,32 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
 ; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
 ; GFX9-SDAG-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX9-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX10-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
 ; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
 ; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load i8, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
index b46476f34cbe2fa..d648c0357119b5a 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll
@@ -6,8 +6,8 @@
 define i16 @trunc_srl_i64_16_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_srl_i64_16_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i64 %x, 16
   %trunc = trunc i64 %shift to i16
@@ -17,8 +17,8 @@ define i16 @trunc_srl_i64_16_to_i16(i64 %x) {
 define i16 @trunc_srl_i64_17_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_srl_i64_17_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i64 %x, 17
   %trunc = trunc i64 %shift to i16
@@ -28,9 +28,9 @@ define i16 @trunc_srl_i64_17_to_i16(i64 %x) {
 define i15 @trunc_srl_i55_16_to_i15(i55 %x) {
 ; GCN-LABEL: trunc_srl_i55_16_to_i15:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 15, v0
 ; GCN-NEXT:    v_add_u16_e32 v0, 4, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i55 %x, 15
   %trunc = trunc i55 %shift to i15
@@ -41,8 +41,8 @@ define i15 @trunc_srl_i55_16_to_i15(i55 %x) {
 define i16 @trunc_sra_i64_16_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_sra_i64_16_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = ashr i64 %x, 16
   %trunc = trunc i64 %shift to i16
@@ -52,8 +52,8 @@ define i16 @trunc_sra_i64_16_to_i16(i64 %x) {
 define i16 @trunc_sra_i64_17_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_sra_i64_17_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b64 v[0:1], 17, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = ashr i64 %x, 17
   %trunc = trunc i64 %shift to i16
@@ -63,8 +63,8 @@ define i16 @trunc_sra_i64_17_to_i16(i64 %x) {
 define i16 @trunc_shl_i64_16_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_shl_i64_16_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = shl i64 %x, 16
   %trunc = trunc i64 %shift to i16
@@ -74,8 +74,8 @@ define i16 @trunc_shl_i64_16_to_i16(i64 %x) {
 define i16 @trunc_shl_i64_17_to_i16(i64 %x) {
 ; GCN-LABEL: trunc_shl_i64_17_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = shl i64 %x, 17
   %trunc = trunc i64 %shift to i16
@@ -85,9 +85,9 @@ define i16 @trunc_shl_i64_17_to_i16(i64 %x) {
 define <2 x i16> @trunc_srl_v2i64_16_to_v2i16(<2 x i64> %x) {
 ; GCN-LABEL: trunc_srl_v2i64_16_to_v2i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, 0x7060302
 ; GCN-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr <2 x i64> %x, <i64 16, i64 16>
   %trunc = trunc <2 x i64> %shift to <2 x i16>
@@ -114,9 +114,9 @@ define amdgpu_kernel void @s_trunc_srl_i64_16_to_i16(i64 %x) {
 define i16 @trunc_srl_i64_var_mask15_to_i16(i64 %x, i64 %amt) {
 ; GCN-LABEL: trunc_srl_i64_var_mask15_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 15, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %amt.masked = and i64 %amt, 15
   %shift = lshr i64 %x, %amt.masked
@@ -127,9 +127,9 @@ define i16 @trunc_srl_i64_var_mask15_to_i16(i64 %x, i64 %amt) {
 define i16 @trunc_srl_i64_var_mask16_to_i16(i64 %x, i64 %amt) {
 ; GCN-LABEL: trunc_srl_i64_var_mask16_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %amt.masked = and i64 %amt, 16
   %shift = lshr i64 %x, %amt.masked
@@ -140,9 +140,9 @@ define i16 @trunc_srl_i64_var_mask16_to_i16(i64 %x, i64 %amt) {
 define i16 @trunc_srl_i64_var_mask31_to_i16(i64 %x, i64 %amt) {
 ; GCN-LABEL: trunc_srl_i64_var_mask31_to_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 31, v2
 ; GCN-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %amt.masked = and i64 %amt, 31
   %shift = lshr i64 %x, %amt.masked
@@ -153,10 +153,10 @@ define i16 @trunc_srl_i64_var_mask31_to_i16(i64 %x, i64 %amt) {
 define i32 @trunc_srl_i64_25_to_i26(i64 %x) {
 ; GCN-LABEL: trunc_srl_i64_25_to_i26:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xa000000, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
 ; GCN-NEXT:    v_add_u32_e32 v0, 55, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %value.knownbits2 = and i64 %x, 167772160 ; 0xA000000
   %shift = lshr i64 %value.knownbits2, 25
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 5f896f92de0f424..de5abf518935399 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -5,22 +5,21 @@
 define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle6766:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x6060706
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle6766:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x6060706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -32,24 +31,23 @@ define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle3744:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x307
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle3744:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0x307
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -61,22 +59,21 @@ define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle4445:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040404
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle4445:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040404
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -88,22 +85,21 @@ define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle0101:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle0101:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -115,24 +111,23 @@ define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle1004:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x40405
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle1004:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40405
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -146,25 +141,23 @@ define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
 ; GFX10-LABEL: shuffle7533:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    flat_load_dword v6, v[0:1]
 ; GFX10-NEXT:    flat_load_dword v7, v[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3030507
 ; GFX10-NEXT:    flat_store_dword v[4:5], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle7533:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_dword v6, v[0:1]
 ; GFX9-NEXT:    flat_load_dword v7, v[2:3]
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3030507
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    flat_store_dword v[4:5], v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
@@ -176,23 +169,21 @@ define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, pt
 define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
 ; GFX10-LABEL: shuffle7767:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060707
 ; GFX10-NEXT:    flat_store_dword v[4:5], v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle7767:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    flat_load_dword v0, v[2:3]
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060707
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    flat_store_dword v[4:5], v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
@@ -204,25 +195,23 @@ define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, pt
 define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
 ; GFX10-LABEL: shuffle0554:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b32 v0, v0
 ; GFX10-NEXT:    ds_read_b32 v1, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x10104
 ; GFX10-NEXT:    ds_write_b32 v2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle0554:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v0, v0
 ; GFX9-NEXT:    ds_read_b32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x10104
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    ds_write_b32 v2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
@@ -234,25 +223,23 @@ define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, pt
 define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) {
 ; GFX10-LABEL: shuffle2127:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_read_b32 v0, v0
 ; GFX10-NEXT:    ds_read_b32 v1, v1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3060506
 ; GFX10-NEXT:    ds_write_b32 v2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle2127:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v0, v0
 ; GFX9-NEXT:    ds_read_b32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3060506
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    ds_write_b32 v2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4
@@ -264,25 +251,24 @@ define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, pt
 define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) {
 ; GFX10-LABEL: shuffle5047:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v3, 0x7040005
 ; GFX10-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle5047:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen
 ; GFX9-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7040005
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v3, s4
 ; GFX9-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4
@@ -294,24 +280,23 @@ define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, pt
 define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle3546:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2000107
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle3546:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2000107
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -324,22 +309,21 @@ define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle7330ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x4070706
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle7330ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x4070706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
@@ -350,21 +334,20 @@ define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle5341ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle5341ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1>
@@ -375,22 +358,21 @@ define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle6106ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle6106ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
@@ -402,22 +384,21 @@ define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle4327ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle4327ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7>
@@ -428,22 +409,21 @@ define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle3263ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060607
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle3263ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060607
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3>
@@ -454,22 +434,21 @@ define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle2763ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060706
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle2763ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3>
@@ -480,22 +459,21 @@ define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle1327ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060705
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle1327ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060705
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7>
@@ -506,22 +484,21 @@ define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: shuffle0605ud2:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040504
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shuffle0605ud2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040504
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
@@ -532,25 +509,24 @@ define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0
 define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: insertUsesOr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: insertUsesOr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -563,7 +539,6 @@ define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: addUsesOr:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -581,11 +556,11 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: addUsesOr:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -594,7 +569,7 @@ define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -672,7 +647,6 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: add:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -692,11 +666,11 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
@@ -710,7 +684,7 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -724,7 +698,6 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p
 define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: add_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -744,11 +717,11 @@ define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %el
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -763,7 +736,7 @@ define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %el
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -780,7 +753,6 @@ define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %el
 define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: add_store:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -799,11 +771,11 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_store:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff00
@@ -819,7 +791,7 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -834,7 +806,6 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %
 define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: add_store_div_16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -859,11 +830,11 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_store_div_16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -885,7 +856,7 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v2
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -903,7 +874,6 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in
 define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: add_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -931,11 +901,11 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x10705
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -958,7 +928,7 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -975,7 +945,6 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: and_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -998,11 +967,11 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5070006
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: and_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1026,7 +995,7 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1043,7 +1012,6 @@ define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: ashr_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1068,11 +1036,11 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ashr_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1098,7 +1066,7 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1115,7 +1083,6 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: bc_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1128,11 +1095,11 @@ define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x7060104
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bc_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1146,7 +1113,7 @@ define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1164,7 +1131,6 @@ define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
 ; GFX10-LABEL: eve_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1179,11 +1145,11 @@ define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v5, v4, 0x1020305
 ; GFX10-NEXT:    global_store_byte v[9:10], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: eve_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1199,7 +1165,7 @@ define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    global_store_byte v[9:10], v1, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1217,7 +1183,6 @@ define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: ive_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v9
@@ -1239,11 +1204,11 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v10, v9, 0x2000706
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ive_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v9, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, 2, v9
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v9
@@ -1266,7 +1231,7 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v3, v10, v9, s5
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1284,7 +1249,6 @@ define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: lhsr_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1307,11 +1271,11 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x1030707
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: lhsr_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1335,7 +1299,7 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1353,7 +1317,6 @@ define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: mul_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1380,11 +1343,11 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2000504
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1407,7 +1370,7 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v1, v9, v4, s4
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1425,7 +1388,6 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: or_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1448,11 +1410,11 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x2010005
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: or_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1476,7 +1438,7 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1493,7 +1455,6 @@ define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: sdiv_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1566,11 +1527,11 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: sdiv_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1644,7 +1605,7 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1662,7 +1623,6 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: sext_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1683,11 +1643,11 @@ define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v2, v9, v4, 0x3010707
 ; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
 ; GFX10-NEXT:    global_store_dword v[5:6], v2, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: sext_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1708,7 +1668,7 @@ define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v2, v4, v9, s4
 ; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
 ; GFX9-NEXT:    global_store_dword v[5:6], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1726,7 +1686,6 @@ define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: shl_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -1749,11 +1708,11 @@ define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shl_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1777,7 +1736,7 @@ define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1795,7 +1754,6 @@ define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: sitofp_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1819,11 +1777,11 @@ define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: sitofp_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -1848,7 +1806,7 @@ define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_perm_b32 v4, v4, v9, s4
 ; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    global_store_dword v[5:6], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -1866,7 +1824,6 @@ define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: srem_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -1950,11 +1907,11 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2070306
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: srem_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2039,7 +1996,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2057,7 +2014,6 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: sub_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2083,11 +2039,11 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: sub_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -2111,7 +2067,7 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2129,7 +2085,6 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) {
 ; GFX10-LABEL: sv_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2141,11 +2096,11 @@ define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x50705
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: sv_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2158,7 +2113,7 @@ define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2176,7 +2131,6 @@ define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i
 define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: trunc_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2200,11 +2154,11 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-NEXT:    global_store_byte v[7:8], v0, off
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: trunc_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2229,7 +2183,7 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX9-NEXT:    global_store_byte v[7:8], v0, off
 ; GFX9-NEXT:    global_store_dword v[5:6], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2246,7 +2200,6 @@ define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1
 define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: udiv:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2299,11 +2252,11 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: udiv:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2357,7 +2310,7 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2375,7 +2328,6 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
 define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: uitofp_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2393,11 +2345,11 @@ define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX10-NEXT:    v_perm_b32 v4, v4, v9, 0x5020104
 ; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX10-NEXT:    global_store_dword v[5:6], v4, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: uitofp_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2416,7 +2368,7 @@ define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 ; GFX9-NEXT:    v_perm_b32 v10, v9, v4, s4
 ; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
 ; GFX9-NEXT:    global_store_dword v[5:6], v10, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2434,7 +2386,6 @@ define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in
 define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: urem_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@@ -2495,11 +2446,11 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: urem_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2561,7 +2512,7 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2579,7 +2530,6 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: xor_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2605,11 +2555,11 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x5060307
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: xor_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2636,7 +2586,7 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v4, v9, v4, s5
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX9-NEXT:    global_store_dword v[7:8], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2654,7 +2604,6 @@ define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
 ; GFX10-LABEL: zext_store_div:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
@@ -2675,11 +2624,11 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v10, 0x5040100
 ; GFX10-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
 ; GFX10-NEXT:    global_store_dword v[5:6], v2, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: zext_store_div:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
@@ -2702,7 +2651,7 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_perm_b32 v1, v3, v4, s6
 ; GFX9-NEXT:    global_store_dwordx2 v[7:8], v[0:1], off
 ; GFX9-NEXT:    global_store_dword v[5:6], v2, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid
@@ -2719,18 +2668,17 @@ define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 define void @Source16Bit(i16 %in, <2 x i16> %reg) {
 ; GFX10-LABEL: Source16Bit:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3050204
 ; GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: Source16Bit:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3050204
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    global_store_dword v[0:1], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %elt0 = extractelement <2 x i16> %reg, i32 1
@@ -2753,24 +2701,23 @@ entry:
 define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: extract3744:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract3744:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3070404
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -2797,24 +2744,23 @@ declare i32 @llvm.amdgcn.perm(i32, i32, i32)
 define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: extract_perm_3744:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3070404
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_perm_3744:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3070404
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -2830,24 +2776,23 @@ define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %i
 define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: extract1347_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1030407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract1347_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x1030407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -2887,24 +2832,23 @@ declare i16 @llvm.fshr.i16(i16, i16, i16)
 define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshri16_8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshri16_8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -2927,24 +2871,23 @@ define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshri16_16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshri16_16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -2967,24 +2910,23 @@ define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshri16_24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshri16_24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3007,24 +2949,23 @@ define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshri16_32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3020706
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshri16_32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3020706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3047,24 +2988,23 @@ define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshri16_88:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshri16_88:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3089,24 +3029,23 @@ declare i16 @llvm.fshl.i16(i16, i16, i16)
 define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshli16_1347:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshli16_1347:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3129,24 +3068,23 @@ define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshli16_16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshli16_16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3169,24 +3107,23 @@ define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshli16_24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshli16_24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3209,24 +3146,23 @@ define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshli16_32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x1000504
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshli16_32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x1000504
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3249,24 +3185,23 @@ define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
 ; GFX10-LABEL: fshli16_88:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x30407
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: fshli16_88:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x30407
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4
   %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4
@@ -3289,7 +3224,6 @@ define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr
 define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) {
 ; GFX10-LABEL: shlbase:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v8, v[2:3], off
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 16, v6
@@ -3303,11 +3237,11 @@ define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ad
 ; GFX10-NEXT:    v_lshl_or_b32 v2, v2, v3, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v2, v0, v1
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: shlbase:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v8, v[2:3], off
 ; GFX9-NEXT:    v_add_u32_e32 v0, 8, v6
@@ -3321,7 +3255,7 @@ define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ad
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v3, v0, v3
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
@@ -3350,7 +3284,6 @@ define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr ad
 define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) {
 ; GFX10-LABEL: extractbase:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v8, v[2:3], off
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v6
@@ -3364,11 +3297,11 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extractbase:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v8, v[2:3], off
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v6
@@ -3382,7 +3315,7 @@ define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, pt
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 8, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4
   %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index b7949130c9515c7..1138ea7cf8680d9 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -8,26 +8,26 @@
 define i16 @shl_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: shl_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: shl_i16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shl_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shl_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = shl i16 %x, %y
   ret i16 %res
@@ -36,26 +36,26 @@ define i16 @shl_i16(i16 %x, i16 %y) {
 define i16 @lshr_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: lshr_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: lshr_i16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: lshr_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: lshr_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = lshr i16 %x, %y
   ret i16 %res
@@ -64,26 +64,26 @@ define i16 @lshr_i16(i16 %x, i16 %y) {
 define i16 @ashr_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: ashr_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: ashr_i16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: ashr_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ashrrev_i16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: ashr_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ashrrev_i16 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = ashr i16 %x, %y
   ret i16 %res
@@ -92,26 +92,26 @@ define i16 @ashr_i16(i16 %x, i16 %y) {
 define i16 @add_u16(i16 %x, i16 %y) {
 ; GFX8-LABEL: add_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: add_u16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: add_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = add i16 %x, %y
   ret i16 %res
@@ -120,26 +120,26 @@ define i16 @add_u16(i16 %x, i16 %y) {
 define i16 @sub_u16(i16 %x, i16 %y) {
 ; GFX8-LABEL: sub_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: sub_u16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: sub_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: sub_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = sub i16 %x, %y
   ret i16 %res
@@ -148,26 +148,26 @@ define i16 @sub_u16(i16 %x, i16 %y) {
 define i16 @mul_lo_u16(i16 %x, i16 %y) {
 ; GFX8-LABEL: mul_lo_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: mul_lo_u16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: mul_lo_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_lo_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = mul i16 %x, %y
   ret i16 %res
@@ -176,26 +176,26 @@ define i16 @mul_lo_u16(i16 %x, i16 %y) {
 define i16 @min_u16(i16 %x, i16 %y) {
 ; GFX8-LABEL: min_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: min_u16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: min_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp ule i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -205,26 +205,26 @@ define i16 @min_u16(i16 %x, i16 %y) {
 define i16 @min_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: min_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: min_i16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_i16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: min_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp sle i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -234,26 +234,26 @@ define i16 @min_i16(i16 %x, i16 %y) {
 define i16 @max_u16(i16 %x, i16 %y) {
 ; GFX8-LABEL: max_u16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: max_u16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: max_u16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: max_u16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp uge i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -263,26 +263,26 @@ define i16 @max_u16(i16 %x, i16 %y) {
 define i16 @max_i16(i16 %x, i16 %y) {
 ; GFX8-LABEL: max_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: max_i16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: max_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_i16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: max_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp sge i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -292,28 +292,28 @@ define i16 @max_i16(i16 %x, i16 %y) {
 define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: shl_i16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: shl_i16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_lshlrev_b16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shl_i16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shl_i16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = shl i16 %x, %y
   %zext = zext i16 %res to i32
@@ -323,28 +323,28 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
 define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: lshr_i16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: lshr_i16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: lshr_i16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b16 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: lshr_i16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = lshr i16 %x, %y
   %zext = zext i16 %res to i32
@@ -354,28 +354,28 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
 define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: ashr_i16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: ashr_i16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_ashrrev_i16_e32 v0, v1, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: ashr_i16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_ashrrev_i16 v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: ashr_i16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ashrrev_i16 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = ashr i16 %x, %y
   %zext = zext i16 %res to i32
@@ -385,28 +385,28 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
 define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: add_u16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: add_u16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_add_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: add_u16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_u16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = add i16 %x, %y
   %zext = zext i16 %res to i32
@@ -416,28 +416,28 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
 define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: sub_u16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: sub_u16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_sub_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: sub_u16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: sub_u16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = sub i16 %x, %y
   %zext = zext i16 %res to i32
@@ -447,28 +447,28 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
 define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: mul_lo_u16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: mul_lo_u16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: mul_lo_u16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_lo_u16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %res = mul i16 %x, %y
   %zext = zext i16 %res to i32
@@ -478,28 +478,28 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
 define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: min_u16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: min_u16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_min_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_u16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: min_u16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_u16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp ule i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -510,28 +510,28 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
 define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: min_i16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: min_i16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_min_i16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: min_i16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_min_i16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: min_i16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_min_i16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp sle i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -542,28 +542,28 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
 define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: max_u16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: max_u16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_max_u16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: max_u16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_u16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: max_u16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_u16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp uge i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -574,28 +574,28 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
 define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
 ; GFX8-LABEL: max_i16_zext_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: max_i16_zext_i32:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_max_i16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: max_i16_zext_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_max_i16 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: max_i16_zext_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_max_i16 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp sge i16 %x, %y
   %res = select i1 %cmp, i16 %x, i16 %y
@@ -606,28 +606,28 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
 define i32 @zext_fadd_f16(half %x, half %y) {
 ; GFX8-LABEL: zext_fadd_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: zext_fadd_f16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zext_fadd_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zext_fadd_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd half %x, %y
   %cast = bitcast half %add to i16
@@ -638,29 +638,29 @@ define i32 @zext_fadd_f16(half %x, half %y) {
 define i32 @zext_fma_f16(half %x, half %y, half %z) {
 ; GFX8-LABEL: zext_fma_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: zext_fma_f16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX9ALL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zext_fma_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zext_fma_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fmac_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call half @llvm.fma.f16(half %x, half %y, half %z)
   %cast = bitcast half %fma to i16
@@ -671,29 +671,29 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) {
 define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
 ; GFX8-LABEL: zext_div_fixup_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_div_fixup_f16 v0, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: zext_div_fixup_f16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_div_fixup_f16 v0, v0, v1, v2
 ; GFX9ALL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zext_div_fixup_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v0, v1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zext_div_fixup_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div.fixup = call half @llvm.amdgcn.div.fixup.f16(half %x, half %y, half %z)
   %cast = bitcast half %div.fixup to i16
@@ -707,28 +707,28 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) {
 define i32 @zext_fptrunc_f16(float %x) {
 ; GFX8-LABEL: zext_fptrunc_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9ALL-LABEL: zext_fptrunc_f16:
 ; GFX9ALL:       ; %bb.0:
-; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9ALL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9ALL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zext_fptrunc_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zext_fptrunc_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fptrunc = fptrunc float %x to half
   %cast = bitcast half %fptrunc to i16
@@ -739,37 +739,37 @@ define i32 @zext_fptrunc_f16(float %x) {
 define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
 ; GFX8-LABEL: zext_fptrunc_fma_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: zext_fptrunc_fma_f16:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
 ; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX906-LABEL: zext_fptrunc_fma_f16:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: zext_fptrunc_fma_f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: zext_fptrunc_fma_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fma = call float @llvm.fma.f32(float %x, float %y, float %z)
   %fptrunc = fptrunc float %fma to half
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 8f74132271ba68c..da500a96ba6c4fe 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -9,9 +9,9 @@
 define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX906-LABEL: preserve_wwm_copy_dstreg:
 ; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    s_mov_b32 s16, s33
 ; GFX906-NEXT:    s_mov_b32 s33, s32
+; GFX906-NEXT:    s_waitcnt expcnt(0)
 ; GFX906-NEXT:    s_xor_saveexec_b64 s[18:19], -1
 ; GFX906-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
 ; GFX906-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
@@ -370,9 +370,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ;
 ; GFX908-LABEL: preserve_wwm_copy_dstreg:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    s_mov_b32 s16, s33
 ; GFX908-NEXT:    s_mov_b32 s33, s32
+; GFX908-NEXT:    s_waitcnt expcnt(0)
 ; GFX908-NEXT:    s_xor_saveexec_b64 s[18:19], -1
 ; GFX908-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 73457654307019e..6dc1ea01ef4b060 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -9,9 +9,8 @@ define i32 @load_atomic_private_seq_cst_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i32, ptr addrspace(5) %ptr seq_cst, align 4
   ret i32 %load
@@ -24,11 +23,10 @@ define i64 @load_atomic_private_seq_cst_i64(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i64, ptr addrspace(5) %ptr seq_cst, align 8
   ret i64 %load
@@ -41,9 +39,8 @@ define void @atomic_store_seq_cst_i32(ptr addrspace(5) %ptr, i32 %val) {
 ;
 ; GCN-LABEL: atomic_store_seq_cst_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   store atomic i32 %val, ptr addrspace(5) %ptr seq_cst, align 4
   ret void
@@ -56,11 +53,10 @@ define void @atomic_store_seq_cst_i64(ptr addrspace(5) %ptr, i64 %val) {
 ;
 ; GCN-LABEL: atomic_store_seq_cst_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
 ; GCN-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   store atomic i64 %val, ptr addrspace(5) %ptr seq_cst, align 8
   ret void
@@ -73,9 +69,8 @@ define i32 @load_atomic_private_seq_cst_syncscope_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: load_atomic_private_seq_cst_syncscope_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %load = load atomic i32, ptr addrspace(5) %ptr syncscope("agent") seq_cst, align 4
   ret i32 %load
@@ -88,9 +83,8 @@ define void @atomic_store_seq_cst_syncscope_i32(ptr addrspace(5) %ptr, i32 %val)
 ;
 ; GCN-LABEL: atomic_store_seq_cst_syncscope_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   store atomic i32 %val, ptr addrspace(5) %ptr syncscope("agent") seq_cst, align 4
   ret void
@@ -111,7 +105,6 @@ define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: cmpxchg_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
@@ -123,7 +116,7 @@ define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = cmpxchg ptr addrspace(5) %ptr, i32 0, i32 1 acq_rel monotonic
   %result.0 = extractvalue { i32, i1 } %result, 0
@@ -147,7 +140,6 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: cmpxchg_private_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 4, v2
 ; GCN-NEXT:    buffer_load_dword v1, v3, s[0:3], 0 offen
@@ -163,7 +155,7 @@ define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = cmpxchg ptr addrspace(5) %ptr, i64 0, i64 1 acq_rel monotonic
   %result.0 = extractvalue { i64, i1 } %result, 0
@@ -181,13 +173,12 @@ define i32 @atomicrmw_xchg_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_xchg_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v2, 4
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xchg ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -202,13 +193,12 @@ define i32 @atomicrmw_add_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_add_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw add ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -223,13 +213,12 @@ define i32 @atomicrmw_sub_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_sub_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, -4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw sub ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -244,13 +233,12 @@ define i32 @atomicrmw_and_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_and_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw and ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -266,14 +254,13 @@ define i32 @atomicrmw_nand_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_nand_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_not_b32_e32 v2, v1
 ; GCN-NEXT:    v_or_b32_e32 v2, -5, v2
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw nand ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -288,13 +275,12 @@ define i32 @atomicrmw_or_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_or_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_or_b32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw or ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -309,13 +295,12 @@ define i32 @atomicrmw_xor_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_xor_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_xor_b32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw xor ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -331,13 +316,12 @@ define i32 @atomicrmw_max_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_max_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_max_i32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw max ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -353,13 +337,12 @@ define i32 @atomicrmw_min_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_min_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_min_i32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw min ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -375,13 +358,12 @@ define i32 @atomicrmw_umax_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_umax_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_max_u32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw umax ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -397,13 +379,12 @@ define i32 @atomicrmw_umin_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_umin_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_min_u32_e32 v2, 4, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw umin ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -418,13 +399,12 @@ define float @atomicrmw_fadd_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_fadd_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v2, 2.0, v1
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(5) %ptr, float 2.0 seq_cst
   ret float %result
@@ -439,13 +419,12 @@ define float @atomicrmw_fsub_private_i32(ptr addrspace(5) %ptr, float %val) {
 ;
 ; GCN-LABEL: atomicrmw_fsub_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v1, v2, v1
 ; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub ptr addrspace(5) %ptr, float %val seq_cst
   ret float %result
@@ -541,7 +520,6 @@ define i32 @atomicrmw_inc_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_inc_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
@@ -549,7 +527,7 @@ define i32 @atomicrmw_inc_private_i32(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw uinc_wrap ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
@@ -568,7 +546,6 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
 ;
 ; GCN-LABEL: atomicrmw_dec_private_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
@@ -578,7 +555,7 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 4, s[4:5]
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw udec_wrap ptr addrspace(5) %ptr, i32 4 seq_cst
   ret i32 %result
diff --git a/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll b/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
index 79bcaf83cec7c09..706a3b7518c8313 100644
--- a/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
+++ b/llvm/test/CodeGen/AMDGPU/prologue-epilogue-markers.ll
@@ -10,15 +10,13 @@ define hidden void @_Z9base_casev() #0 !dbg !6 {
 ; CHECK-LABEL: _Z9base_casev:
 ; CHECK:       .Lfunc_begin0:
 ; CHECK-NEXT:    .file 0 "dir" "file.cpp"
-; CHECK-NEXT:    .loc 0 5 0 ; file.cpp:5:0
 ; CHECK-NEXT:    .cfi_sections .debug_frame
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    .loc 0 7 3 prologue_end ; file.cpp:7:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
-; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:  .Ltmp0:
 
 ; DWARFLINE:		file format elf64-amdgpu
 ; DWARFLINE:		.debug_line contents
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
index e29f09dcac0248a..d36dd29e1a76431 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-arg-dbg-value.ll
@@ -16,13 +16,12 @@ define hidden void @ptr_arg_split_subregs(ptr %arg1) #0 !dbg !9 {
 ; CHECK-NEXT:  ; %bb.0:
 ; CHECK-NEXT:    ;DEBUG_VALUE: ptr_arg_split_subregs:a <- [DW_OP_LLVM_fragment 32 32] [$vgpr1+0]
 ; CHECK-NEXT:    ;DEBUG_VALUE: ptr_arg_split_subregs:a <- [DW_OP_LLVM_fragment 0 32] [$vgpr0+0]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    .loc 1 7 13 prologue_end ; example.cpp:7:13
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2 offset:396
 ; CHECK-NEXT:    .loc 1 8 5 ; example.cpp:8:5
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp1:
   call void @llvm.dbg.declare(metadata ptr %arg1, metadata !20, metadata !DIExpression()), !dbg !21
@@ -44,7 +43,6 @@ define hidden void @ptr_arg_split_reg_mem(<30 x i32>, ptr %arg2) #0 !dbg !25 {
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
 ; CHECK-NEXT:    ;DEBUG_VALUE: ptr_arg_split_reg_mem:b <- [$vgpr30+0]
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 1
 ; CHECK-NEXT:  .Ltmp2:
@@ -52,7 +50,7 @@ define hidden void @ptr_arg_split_reg_mem(<30 x i32>, ptr %arg2) #0 !dbg !25 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dword v[30:31], v0 offset:396
 ; CHECK-NEXT:    .loc 1 13 5 ; example.cpp:13:5
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp3:
   call void @llvm.dbg.declare(metadata ptr %arg2, metadata !26, metadata !DIExpression()), !dbg !27
@@ -69,7 +67,6 @@ define hidden void @ptr_arg_in_memory(<32 x i32>, ptr %arg3) #0 !dbg !31 {
 ; CHECK-NEXT:    .loc 1 15 0 ; example.cpp:15:0
 ; CHECK-NEXT:    .cfi_startproc
 ; CHECK-NEXT:  ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1
@@ -78,7 +75,7 @@ define hidden void @ptr_arg_in_memory(<32 x i32>, ptr %arg3) #0 !dbg !31 {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2 offset:396
 ; CHECK-NEXT:    .loc 1 18 5 ; example.cpp:18:5
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 ; CHECK-NEXT:  .Ltmp5:
   call void @llvm.dbg.declare(metadata ptr %arg3, metadata !32, metadata !DIExpression()), !dbg !33
diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
index 70622706789331f..77b338c193fee8a 100644
--- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptrmask.ll
@@ -6,16 +6,16 @@
 define ptr addrspace(1) @v_ptrmask_global_variable_i64(ptr addrspace(1) %ptr, i64 %mask) {
 ; GCN-LABEL: v_ptrmask_global_variable_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, v1, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ptrmask_global_variable_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %masked = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) %ptr, i64 %mask)
   ret ptr addrspace(1) %masked
@@ -24,14 +24,14 @@ define ptr addrspace(1) @v_ptrmask_global_variable_i64(ptr addrspace(1) %ptr, i6
 define ptr addrspace(3) @v_ptrmask_local_variable_i32(ptr addrspace(3) %ptr, i32 %mask) {
 ; GCN-LABEL: v_ptrmask_local_variable_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ptrmask_local_variable_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %masked = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) %ptr, i32 %mask)
   ret ptr addrspace(3) %masked
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 8df1f76d2e327d4..bd68442fb82be49 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -7,7 +7,6 @@
 define float @v_rcp_f32_ieee(float %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -19,11 +18,11 @@ define float @v_rcp_f32_ieee(float %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -35,6 +34,7 @@ define float @v_rcp_f32_ieee(float %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee:
@@ -48,8 +48,8 @@ define float @v_rcp_f32_ieee(float %x) #3 {
 define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
 ; GCN-LABEL: v_rcp_f32_ieee_unsafe:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_unsafe:
@@ -63,7 +63,6 @@ define float @v_rcp_f32_ieee_unsafe(float %x) #4 {
 define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee_known_not_denormal:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -75,11 +74,11 @@ define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee_known_not_denormal:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -91,6 +90,7 @@ define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_known_not_denormal:
@@ -104,7 +104,6 @@ define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 {
 define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee_nnan_ninf:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -116,11 +115,11 @@ define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee_nnan_ninf:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -132,6 +131,7 @@ define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_nnan_ninf:
@@ -145,7 +145,6 @@ define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 {
 define float @v_neg_rcp_f32_ieee(float %x) #3 {
 ; SI-LABEL: v_neg_rcp_f32_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
@@ -157,11 +156,11 @@ define float @v_neg_rcp_f32_ieee(float %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v1, v4, v3
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_neg_rcp_f32_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -173,6 +172,7 @@ define float @v_neg_rcp_f32_ieee(float %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_neg_rcp_f32_ieee:
@@ -186,7 +186,6 @@ define float @v_neg_rcp_f32_ieee(float %x) #3 {
 define float @v_rcp_f32_daz(float %x) #0 {
 ; SI-LABEL: v_rcp_f32_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -200,11 +199,11 @@ define float @v_rcp_f32_daz(float %x) #0 {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -218,6 +217,7 @@ define float @v_rcp_f32_daz(float %x) #0 {
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_daz:
@@ -231,7 +231,6 @@ define float @v_rcp_f32_daz(float %x) #0 {
 define float @v_neg_rcp_f32_daz(float %x) #0 {
 ; SI-LABEL: v_neg_rcp_f32_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
 ; SI-NEXT:    v_rcp_f32_e32 v2, v1
 ; SI-NEXT:    v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
@@ -245,11 +244,11 @@ define float @v_neg_rcp_f32_daz(float %x) #0 {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_neg_rcp_f32_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
 ; VI-NEXT:    v_div_scale_f32 v2, vcc, -1.0, v0, -1.0
 ; VI-NEXT:    v_rcp_f32_e32 v3, v1
@@ -263,6 +262,7 @@ define float @v_neg_rcp_f32_daz(float %x) #0 {
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, v0, -1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_neg_rcp_f32_daz:
@@ -276,7 +276,6 @@ define float @v_neg_rcp_f32_daz(float %x) #0 {
 define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee_ulp25:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -285,16 +284,17 @@ define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee_ulp25:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_ulp25:
@@ -308,7 +308,6 @@ define float @v_rcp_f32_ieee_ulp25(float %x) #3 {
 define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -317,16 +316,17 @@ define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal:
@@ -340,7 +340,6 @@ define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #
 define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 {
 ; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
 ; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
@@ -349,16 +348,17 @@ define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal:
@@ -372,7 +372,6 @@ define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %
 define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
 ; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; SI-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
@@ -381,16 +380,17 @@ define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e32 v1, v0
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan:
@@ -404,8 +404,8 @@ define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 {
 define float @v_rcp_f32_daz_ulp25(float %x) #0 {
 ; GCN-LABEL: v_rcp_f32_daz_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_f32_daz_ulp25:
@@ -419,7 +419,6 @@ define float @v_rcp_f32_daz_ulp25(float %x) #0 {
 define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
 ; SI-LABEL: v_neg_rcp_f32_ieee_ulp25:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
 ; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
@@ -428,16 +427,17 @@ define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_neg_rcp_f32_ieee_ulp25:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, -v0
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_neg_rcp_f32_ieee_ulp25:
@@ -451,8 +451,8 @@ define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 {
 define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
 ; GCN-LABEL: v_neg_rcp_f32_daz_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_neg_rcp_f32_daz_ulp25:
@@ -466,7 +466,6 @@ define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 {
 define float @v_rcp_fabs_f32_ieee(float %x) #3 {
 ; SI-LABEL: v_rcp_fabs_f32_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v3, v2
@@ -479,11 +478,11 @@ define float @v_rcp_fabs_f32_ieee(float %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_fabs_f32_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
 ; VI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
@@ -496,6 +495,7 @@ define float @v_rcp_fabs_f32_ieee(float %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_fabs_f32_ieee:
@@ -510,7 +510,6 @@ define float @v_rcp_fabs_f32_ieee(float %x) #3 {
 define float @v_rcp_fabs_f32_daz(float %x) #0 {
 ; SI-LABEL: v_rcp_fabs_f32_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
 ; SI-NEXT:    v_rcp_f32_e32 v3, v2
@@ -525,11 +524,11 @@ define float @v_rcp_fabs_f32_daz(float %x) #0 {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_fabs_f32_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
 ; VI-NEXT:    v_div_scale_f32 v1, vcc, 1.0, v1, 1.0
@@ -544,6 +543,7 @@ define float @v_rcp_fabs_f32_daz(float %x) #0 {
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, 1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_fabs_f32_daz:
@@ -558,7 +558,6 @@ define float @v_rcp_fabs_f32_daz(float %x) #0 {
 define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
 ; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e64 v1, |v0|
 ; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
@@ -567,16 +566,17 @@ define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, |v0|
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25:
@@ -591,8 +591,8 @@ define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
 define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
 ; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e64 v0, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_fabs_f32_daz_ulp25:
@@ -607,7 +607,6 @@ define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 {
 define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
 ; SI-LABEL: v_rcp_neg_fabs_f32_ieee:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
 ; SI-NEXT:    v_rcp_f32_e32 v3, v2
@@ -620,11 +619,11 @@ define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
 ; SI-NEXT:    v_fma_f32 v1, -v2, v4, v1
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_neg_fabs_f32_ieee:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
 ; VI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
@@ -637,6 +636,7 @@ define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
 ; VI-NEXT:    v_fma_f32 v1, -v2, v4, v1
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_neg_fabs_f32_ieee:
@@ -651,7 +651,6 @@ define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 {
 define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
 ; SI-LABEL: v_rcp_neg_fabs_f32_daz:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
 ; SI-NEXT:    v_rcp_f32_e32 v3, v2
@@ -666,11 +665,11 @@ define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
 ; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; SI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; SI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_neg_fabs_f32_daz:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
 ; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
 ; VI-NEXT:    v_div_scale_f32 v1, vcc, -1.0, v1, -1.0
@@ -685,6 +684,7 @@ define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
 ; VI-NEXT:    v_div_fixup_f32 v0, v1, |v0|, -1.0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_neg_fabs_f32_daz:
@@ -699,7 +699,6 @@ define float @v_rcp_neg_fabs_f32_daz(float %x) #0 {
 define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
 ; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x7f800000
 ; SI-NEXT:    v_frexp_mant_f32_e64 v1, -|v0|
 ; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
@@ -708,16 +707,17 @@ define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
 ; SI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_frexp_mant_f32_e64 v1, -|v0|
 ; VI-NEXT:    v_rcp_f32_e32 v1, v1
 ; VI-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0, v0
 ; VI-NEXT:    v_ldexp_f32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
@@ -732,8 +732,8 @@ define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
 define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 {
 ; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index 536b2d054272ee1..8e0df785fef121b 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -13,28 +13,28 @@
 define i32 @v_mul_add_1_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_1_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul = mul i32 %x, %add
@@ -44,28 +44,28 @@ define i32 @v_mul_add_1_i32(i32 %x, i32 %y) {
 define i32 @v_mul_add_1_i32_commute(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_1_i32_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul = mul i32 %add, %x
@@ -75,28 +75,28 @@ define i32 @v_mul_add_1_i32_commute(i32 %x, i32 %y) {
 define i32 @v_mul_add_x_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_x_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_x_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %x, %y
   %add = add i32 %x, %mul
@@ -106,30 +106,30 @@ define i32 @v_mul_add_x_i32(i32 %x, i32 %y) {
 define i32 @v_mul_sub_1_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_sub_1_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %y, 1
   %mul = mul i32 %x, %sub
@@ -139,30 +139,30 @@ define i32 @v_mul_sub_1_i32(i32 %x, i32 %y) {
 define i32 @v_mul_sub_1_i32_commute(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_sub_1_i32_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i32_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i32_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i32_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %y, 1
   %mul = mul i32 %sub, %x
@@ -172,30 +172,30 @@ define i32 @v_mul_sub_1_i32_commute(i32 %x, i32 %y) {
 define i32 @v_mul_sub_x_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_sub_x_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_x_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %x, %y
   %sub = sub i32 %mul, %x
@@ -205,30 +205,30 @@ define i32 @v_mul_sub_x_i32(i32 %x, i32 %y) {
 define i32 @v_mul_add_2_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_2_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 2, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 2
   %mul = mul i32 %x, %add
@@ -238,30 +238,30 @@ define i32 @v_mul_add_2_i32(i32 %x, i32 %y) {
 define i32 @v_mul_sub_2_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_sub_2_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -2, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -2, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -2, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %y, 2
   %mul = mul i32 %x, %sub
@@ -271,30 +271,30 @@ define i32 @v_mul_sub_2_i32(i32 %x, i32 %y) {
 define i32 @v_mul_add_65_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_add_65_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 0x41, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_65_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x41, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_65_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x41, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_65_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x41, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 65
   %mul = mul i32 %x, %add
@@ -304,30 +304,30 @@ define i32 @v_mul_add_65_i32(i32 %x, i32 %y) {
 define i32 @v_mul_sub_65_i32(i32 %x, i32 %y) {
 ; GFX67-LABEL: v_mul_sub_65_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffbf, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_65_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xffffffbf, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_65_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0xffffffbf, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_65_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0xffffffbf, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %y, 65
   %mul = mul i32 %x, %sub
@@ -337,26 +337,26 @@ define i32 @v_mul_sub_65_i32(i32 %x, i32 %y) {
 define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_zext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -366,30 +366,30 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 define i24 @v_mul_sub_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 ; GFX67-LABEL: v_mul_sub_1_i24_zext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i24_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i24_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i24_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i24 %y, 1
   %mul = mul i24 %x, %sub
@@ -399,26 +399,26 @@ define i24 @v_mul_sub_1_i24_zext(i24 zeroext %x, i24 zeroext %y) {
 define i24 @v_add_mul_i24_zext_1(i24 zeroext %x, i24 zeroext %y) {
 ; GFX67-LABEL: v_add_mul_i24_zext_1:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_mul_i24_zext_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_mul_i24_zext_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_mul_i24_zext_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i24 %x, %y
   %add = add i24 %mul, %x
@@ -428,26 +428,26 @@ define i24 @v_add_mul_i24_zext_1(i24 zeroext %x, i24 zeroext %y) {
 define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
 ; GFX67-LABEL: v_mul_add_1_i24_sext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i24_sext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i24_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i24_sext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i24 %y, 1
   %mul = mul i24 %x, %add
@@ -457,26 +457,26 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) {
 define i24 @v_add_mul_i24_sext_1(i24 signext %x, i24 signext %y) {
 ; GFX67-LABEL: v_add_mul_i24_sext_1:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_add_mul_i24_sext_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_mul_i24_sext_1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_mul_i24_sext_1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i24 %x, %y
   %add = add i24 %mul, %x
@@ -486,30 +486,30 @@ define i24 @v_add_mul_i24_sext_1(i24 signext %x, i24 signext %y) {
 define i24 @v_mul_sub_1_i24_sext(i24 signext %x, i24 signext %y) {
 ; GFX67-LABEL: v_mul_sub_1_i24_sext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i24_sext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, -1, v1
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i24_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i24_sext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i24 %y, 1
   %mul = mul i24 %x, %sub
@@ -519,28 +519,28 @@ define i24 @v_mul_sub_1_i24_sext(i24 signext %x, i24 signext %y) {
 define i25 @v_mul_add_1_i25_zext(i25 zeroext %x, i25 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i25_zext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i25_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i25_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i25_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i25 %y, 1
   %mul = mul i25 %x, %add
@@ -550,30 +550,30 @@ define i25 @v_mul_add_1_i25_zext(i25 zeroext %x, i25 zeroext %y) {
 define i25 @v_mul_sub_1_i25_zext(i25 zeroext %x, i25 zeroext %y) {
 ; GFX67-LABEL: v_mul_sub_1_i25_zext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 0x1ffffff, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i25_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x1ffffff, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i25_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x1ffffff, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i25_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x1ffffff, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i25 %y, 1
   %mul = mul i25 %x, %sub
@@ -583,28 +583,28 @@ define i25 @v_mul_sub_1_i25_zext(i25 zeroext %x, i25 zeroext %y) {
 define i25 @v_mul_add_1_i25_sext(i25 signext %x, i25 signext %y) {
 ; GFX67-LABEL: v_mul_add_1_i25_sext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i25_sext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i25_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i25_sext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i25 %y, 1
   %mul = mul i25 %x, %add
@@ -614,30 +614,30 @@ define i25 @v_mul_add_1_i25_sext(i25 signext %x, i25 signext %y) {
 define i25 @v_mul_sub_1_i25_sext(i25 signext %x, i25 signext %y) {
 ; GFX67-LABEL: v_mul_sub_1_i25_sext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 0x1ffffff, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i25_sext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x1ffffff, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i25_sext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x1ffffff, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i25_sext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x1ffffff, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i25 %y, 1
   %mul = mul i25 %x, %sub
@@ -647,29 +647,29 @@ define i25 @v_mul_sub_1_i25_sext(i25 signext %x, i25 signext %y) {
 define i16 @v_mul_add_1_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_add_1_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
   %mul = mul i16 %x, %add
@@ -679,31 +679,31 @@ define i16 @v_mul_add_1_i16(i16 %x, i16 %y) {
 define i32 @v_mul_add_1_i16_zext_result(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16_zext_result:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
   %mul = mul i16 %x, %add
@@ -714,29 +714,29 @@ define i32 @v_mul_add_1_i16_zext_result(i16 %x, i16 %y) {
 define i16 @v_mul_add_1_i16_commute(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_add_1_i16_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i16_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i16_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i16_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 1
   %mul = mul i16 %add, %x
@@ -746,28 +746,28 @@ define i16 @v_mul_add_1_i16_commute(i16 %x, i16 %y) {
 define i16 @v_mul_add_x_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_add_x_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v2, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_x_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %x, %y
   %add = add i16 %x, %mul
@@ -777,32 +777,32 @@ define i16 @v_mul_add_x_i16(i16 %x, i16 %y) {
 define i16 @v_mul_sub_1_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_sub_1_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v1, -1, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v1, v1, -1
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i16 %y, 1
   %mul = mul i16 %x, %sub
@@ -812,32 +812,32 @@ define i16 @v_mul_sub_1_i16(i16 %x, i16 %y) {
 define i16 @v_mul_sub_1_i16_commute(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_sub_1_i16_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i16_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v1, -1, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i16_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v1, -1, v1
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i16_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v1, v1, -1
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i16 %y, 1
   %mul = mul i16 %sub, %x
@@ -847,32 +847,32 @@ define i16 @v_mul_sub_1_i16_commute(i16 %x, i16 %y) {
 define i16 @v_mul_sub_x_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_sub_x_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
 ; GFX67-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_x_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v0, v1
 ; GFX9-NEXT:    v_sub_u16_e32 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u16 v1, v0, v1
 ; GFX10-NEXT:    v_sub_nc_u16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %x, %y
   %sub = sub i16 %mul, %x
@@ -882,32 +882,32 @@ define i16 @v_mul_sub_x_i16(i16 %x, i16 %y) {
 define i16 @v_mul_add_2_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_add_2_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v1, 2, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v1, 2, v1
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v1, v1, 2
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i16 %y, 2
   %mul = mul i16 %x, %add
@@ -917,32 +917,32 @@ define i16 @v_mul_add_2_i16(i16 %x, i16 %y) {
 define i16 @v_mul_sub_2_i16(i16 %x, i16 %y) {
 ; GFX67-LABEL: v_mul_sub_2_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, -2, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e32 v1, -2, v1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e32 v1, -2, v1
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v1, v1, -2
 ; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i16 %y, 2
   %mul = mul i16 %x, %sub
@@ -952,7 +952,6 @@ define i16 @v_mul_sub_2_i16(i16 %x, i16 %y) {
 define i64 @v_mul_add_1_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_1_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
@@ -961,48 +960,49 @@ define i64 @v_mul_add_1_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_1_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %y, 1
   %mul = mul i64 %x, %add
@@ -1012,7 +1012,6 @@ define i64 @v_mul_add_1_i64(i64 %x, i64 %y) {
 define i64 @v_mul_add_1_i64_commute(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_1_i64_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
@@ -1021,48 +1020,49 @@ define i64 @v_mul_add_1_i64_commute(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_1_i64_commute:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i64_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i64_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i64_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %y, 1
   %mul = mul i64 %add, %x
@@ -1072,7 +1072,6 @@ define i64 @v_mul_add_1_i64_commute(i64 %x, i64 %y) {
 define i64 @v_mul_add_x_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_x_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
@@ -1081,48 +1080,49 @@ define i64 @v_mul_add_x_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_x_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX7-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
 ; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_x_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %x, %y
   %add = add i64 %x, %mul
@@ -1132,7 +1132,6 @@ define i64 @v_mul_add_x_i64(i64 %x, i64 %y) {
 define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_sub_1_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
@@ -1141,11 +1140,11 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_sub_1_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
@@ -1154,11 +1153,11 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
@@ -1167,28 +1166,29 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, 0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i64 %y, 1
   %mul = mul i64 %x, %sub
@@ -1198,7 +1198,6 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) {
 define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_sub_1_i64_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v2, v1
@@ -1207,11 +1206,11 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_sub_1_i64_commute:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, -1, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v2, vcc, -1, v3, vcc
@@ -1220,11 +1219,11 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_i64_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, -1, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, -1, v3, vcc
@@ -1233,28 +1232,29 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_i64_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, -1, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v1
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_i64_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v0, 0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i64 %y, 1
   %mul = mul i64 %sub, %x
@@ -1264,7 +1264,6 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) {
 define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_sub_x_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v1, v2
@@ -1273,11 +1272,11 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_sub_x_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v0, v2, 0
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v1, v2
@@ -1285,11 +1284,11 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX7-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
 ; GFX7-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v0, v2, 0
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v2
@@ -1297,28 +1296,29 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v3, v0
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_x_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v0, v2, 0
 ; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v4
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v2, 0
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v5, v4
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v2, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %x, %y
   %sub = sub i64 %mul, %x
@@ -1328,7 +1328,6 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) {
 define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_add_2_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
@@ -1337,11 +1336,11 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_add_2_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1350,11 +1349,11 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1363,28 +1362,29 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, 0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %y, 2
   %mul = mul i64 %x, %add
@@ -1394,7 +1394,6 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) {
 define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
 ; GFX6-LABEL: v_mul_sub_2_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, -2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v3
@@ -1403,11 +1402,11 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_sub_2_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, -2, v2
 ; GFX7-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
@@ -1416,11 +1415,11 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -2, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v3, vcc
@@ -1429,28 +1428,29 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v4, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, -2, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0
 ; GFX9-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, -2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v4, v1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v0, v3
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, 0
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i64 %y, 2
   %mul = mul i64 %x, %sub
@@ -1460,45 +1460,45 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) {
 define <2 x i32> @v_mul_add_1_i32_multiple(i32 %x, i32 %y, i32 %z) {
 ; GFX67-LABEL: v_mul_add_1_i32_multiple:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v0, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_multiple:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v0, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_add_1_i32_multiple:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v3, v[0:1]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, v3, v[2:3]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_add_1_i32_multiple:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v3, v[0:1]
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v3, v[2:3]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_multiple:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v3, v[0:1]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, v3, v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul0 = mul i32 %x, %add
@@ -1511,30 +1511,30 @@ define <2 x i32> @v_mul_add_1_i32_multiple(i32 %x, i32 %y, i32 %z) {
 define <2 x i32> @v_mul_add_1_i32_other_use(i32 %x, i32 %y, i32 %z) {
 ; GFX67-LABEL: v_mul_add_1_i32_other_use:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_other_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 1, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32_other_use:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_other_use:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %y, 1
   %mul0 = mul i32 %x, %add
@@ -1547,44 +1547,44 @@ define <2 x i32> @v_mul_add_1_i32_other_use(i32 %x, i32 %y, i32 %z) {
 define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 ; GFX67-LABEL: v_mul_add_1_i32_chain:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, v1, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i32_chain:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i32_chain:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i32_chain:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[0:1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %i2 = add i32 %arg0, 1
   %i3 = mul i32 %i2, %arg1
@@ -1597,7 +1597,6 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) {
 define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1606,31 +1605,32 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %y, <i16 1, i16 1>
   %mul = mul <2 x i16> %x, %add
@@ -1640,7 +1640,6 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i16_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1649,31 +1648,32 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i16_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i16_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i16_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %y, <i16 1, i16 1>
   %mul = mul <2 x i16> %add, %x
@@ -1683,7 +1683,6 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_add_x_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-NEXT:    v_and_b32_e32 v5, 0xffff, v1
@@ -1694,31 +1693,32 @@ define <2 x i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_x_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %x, %y
   %add = add <2 x i16> %x, %mul
@@ -1728,7 +1728,6 @@ define <2 x i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1737,31 +1736,32 @@ define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    v_add_u16_e32 v2, -1, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %y, <i16 1, i16 1>
   %mul = mul <2 x i16> %x, %sub
@@ -1771,7 +1771,6 @@ define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i16_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1780,31 +1779,32 @@ define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i16_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -1
 ; GFX8-NEXT:    v_add_u16_e32 v2, -1, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i16_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i16_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %y, <i16 1, i16 1>
   %mul = mul <2 x i16> %sub, %x
@@ -1814,7 +1814,6 @@ define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_sub_x_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v4, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v2, v4, v2
@@ -1827,31 +1826,32 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_sub_u16_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_x_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v1, v0, v1
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %x, %y
   %sub = sub <2 x i16> %mul, %x
@@ -1861,7 +1861,6 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_add_2_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 2, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1870,31 +1869,32 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX8-NEXT:    v_add_u16_e32 v2, 2, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_u16 v1, v1, -2 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i16> %y, <i16 2, i16 2>
   %mul = mul <2 x i16> %x, %add
@@ -1904,7 +1904,6 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-LABEL: v_mul_sub_2_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -2, v2
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -2, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -1913,31 +1912,32 @@ define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, -2
 ; GFX8-NEXT:    v_add_u16_e32 v2, -2, v1
 ; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, 2 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_sub_i16 v1, v1, 2 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i16> %y, <i16 2, i16 2>
   %mul = mul <2 x i16> %x, %sub
@@ -1947,45 +1947,45 @@ define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) {
 define <2 x i32> @v_mul_add_1_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_add_1_v2i32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v3, v[1:2]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v4
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_add_1_v2i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v3, v[1:2]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %x, %add
@@ -1995,45 +1995,45 @@ define <2 x i32> @v_mul_add_1_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_add_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v3, v[1:2]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v4
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i32_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v3, v[1:2]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %add, %x
@@ -2043,45 +2043,45 @@ define <2 x i32> @v_mul_add_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_add_x_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_x_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_add_x_v2i32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v0, v2, v[0:1]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v1, v3, v[1:2]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v4
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_add_x_v2i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[0:1]
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, v[0:1]
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v3, v[1:2]
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i32> %x, %y
   %add = add <2 x i32> %x, %mul
@@ -2091,38 +2091,38 @@ define <2 x i32> @v_mul_add_x_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_sub_1_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %x, %sub
@@ -2132,38 +2132,38 @@ define <2 x i32> @v_mul_sub_1_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_sub_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i32_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i32_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i32_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i32_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v2, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i32> %y, <i32 1, i32 1>
   %mul = mul <2 x i32> %sub, %x
@@ -2173,47 +2173,47 @@ define <2 x i32> @v_mul_sub_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_sub_x_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_sub_x_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX67-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX67-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_sub_x_v2i32:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX900-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX900-NEXT:    v_sub_u32_e32 v0, v2, v0
 ; GFX900-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_sub_x_v2i32:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX90A-NEXT:    v_sub_u32_e32 v0, v2, v0
 ; GFX90A-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i32> %x, %y
   %sub = sub <2 x i32> %mul, %x
@@ -2223,38 +2223,38 @@ define <2 x i32> @v_mul_sub_x_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_add_2_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_add_2_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 2, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, 2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i32> %y, <i32 2, i32 2>
   %mul = mul <2 x i32> %x, %add
@@ -2264,38 +2264,38 @@ define <2 x i32> @v_mul_add_2_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i32> @v_mul_sub_2_v2i32(<2 x i32> %x, <2 x i32> %y) {
 ; GFX67-LABEL: v_mul_sub_2_v2i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -2, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -2, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -2, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -2, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i32> %y, <i32 2, i32 2>
   %mul = mul <2 x i32> %x, %sub
@@ -2305,30 +2305,30 @@ define <2 x i32> @v_mul_sub_2_v2i32(<2 x i32> %x, <2 x i32> %y) {
 define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX67-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX10-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %x, %add
@@ -2338,30 +2338,30 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX67-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i24_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX10-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %add, %x
@@ -2371,30 +2371,30 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_add_x_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_x_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX67-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_x_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_x_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_x_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v2, v0
 ; GFX10-NEXT:    v_mad_u32_u24 v1, v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i24> %x, %y
   %add = add <2 x i24> %x, %mul
@@ -2404,38 +2404,38 @@ define <2 x i24> @v_mul_add_x_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_sub_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %x, %sub
@@ -2445,38 +2445,38 @@ define <2 x i24> @v_mul_sub_1_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_sub_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_sub_1_v2i24_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -1, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_1_v2i24_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_1_v2i24_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -1, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_1_v2i24_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -1, v3
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i24> %y, <i24 1, i24 1>
   %mul = mul <2 x i24> %sub, %x
@@ -2486,38 +2486,38 @@ define <2 x i24> @v_mul_sub_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_sub_x_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_sub_x_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v3, v1, v3
 ; GFX67-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GFX67-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_x_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v3, v1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_x_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v1, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_x_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v3, v1, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v2, v0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i24> %x, %y
   %sub = sub <2 x i24> %mul, %x
@@ -2527,38 +2527,38 @@ define <2 x i24> @v_mul_sub_x_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_add_2_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_add_2_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, 2, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_2_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_2_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, 2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_2_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, 2, v3
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i24> %y, <i24 2, i24 2>
   %mul = mul <2 x i24> %x, %add
@@ -2568,38 +2568,38 @@ define <2 x i24> @v_mul_add_2_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define <2 x i24> @v_mul_sub_2_v2i24(<2 x i24> %x, <2 x i24> %y) {
 ; GFX67-LABEL: v_mul_sub_2_v2i24:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v3, vcc, -2, v3
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, -2, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_sub_2_v2i24:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, -2, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, -2, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_sub_2_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v3, -2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, -2, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_sub_2_v2i24:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, -2, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, -2, v3
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub <2 x i24> %y, <i24 2, i24 2>
   %mul = mul <2 x i24> %x, %sub
@@ -2609,28 +2609,28 @@ define <2 x i24> @v_mul_sub_2_v2i24(<2 x i24> %x, <2 x i24> %y) {
 define i32 @v_mul_9_add_52_i32(i32 %arg) {
 ; GFX67-LABEL: v_mul_9_add_52_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, 9
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 9
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 52, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_9_add_52_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 9, 52
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %arg, 9
   %add = add i32 %mul, 52
@@ -2640,27 +2640,27 @@ define i32 @v_mul_9_add_52_i32(i32 %arg) {
 define i16 @v_mul_9_add_52_i16(i16 %arg) {
 ; GFX67-LABEL: v_mul_9_add_52_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 9, 52
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, 9, 52
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_9_add_52_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, 9, 52
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, 9, 52
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg, 9
   %add = add i16 %mul, 52
@@ -2670,7 +2670,6 @@ define i16 @v_mul_9_add_52_i16(i16 %arg) {
 define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
 ; GFX67-LABEL: v_mul_9_add_52_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mad_u32_u24 v1, v1, 9, 52
@@ -2679,30 +2678,31 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_mad_u16 v1, v1, 9, 52
 ; GFX8-NEXT:    v_mad_u16 v0, v0, 9, 52
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_9_add_52_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, 9 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 52 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, 9 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 52 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %arg, <i16 9, i16 9>
   %add = add <2 x i16> %mul, <i16 52, i16 52>
@@ -2712,55 +2712,55 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) {
 define i64 @v_mul_9_add_52_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_9_add_52_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 9
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, 9
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 9
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 52, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_9_add_52_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v1, 9
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_9_add_52_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, 9
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_9_add_52_i64:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, 9, v[1:2]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_9_add_52_i64:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 9, 52
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, 9, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_9_add_52_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 9, 52
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, 9, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 9
   %add = add i64 %mul, 52
@@ -2770,28 +2770,28 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) {
 define i32 @v_mul_5_add_1_i32(i32 %arg) {
 ; GFX67-LABEL: v_mul_5_add_1_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, 5
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_5_add_1_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 5, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %arg, 5
   %add = add i32 %mul, 1
@@ -2801,33 +2801,33 @@ define i32 @v_mul_5_add_1_i32(i32 %arg) {
 define i32 @v_mul_284_add_82_i32(i32 %arg) {
 ; GFX67-LABEL: v_mul_284_add_82_i32:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX67-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX67-NEXT:    v_add_i32_e32 v0, vcc, 0x52, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_284_add_82_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x52, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_284_add_82_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX9-NEXT:    v_add_u32_e32 v0, 0x52, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_284_add_82_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, s4, 0x52
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i32 %arg, 284
   %add = add i32 %mul, 82
@@ -2837,27 +2837,27 @@ define i32 @v_mul_284_add_82_i32(i32 %arg) {
 define i16 @v_mul_5_add_1_i16(i16 %arg) {
 ; GFX67-LABEL: v_mul_5_add_1_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, 5, 1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, 5, 1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_5_add_1_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, 5, 1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, 5, 1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg, 5
   %add = add i16 %mul, 1
@@ -2867,34 +2867,34 @@ define i16 @v_mul_5_add_1_i16(i16 %arg) {
 define i16 @v_mul_284_add_82_i16(i16 %arg) {
 ; GFX67-LABEL: v_mul_284_add_82_i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX67-NEXT:    v_mov_b32_e32 v1, 0x52
 ; GFX67-NEXT:    v_mad_u32_u24 v0, v0, s4, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_284_add_82_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x52
 ; GFX8-NEXT:    v_mad_u16 v0, v0, s4, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_284_add_82_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x52
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, s4, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_284_add_82_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX10-NEXT:    v_mad_u16 v0, v0, s4, 0x52
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i16 %arg, 284
   %add = add i16 %mul, 82
@@ -2904,7 +2904,6 @@ define i16 @v_mul_284_add_82_i16(i16 %arg) {
 define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
 ; GFX67-LABEL: v_mul_5_add_1_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    v_mad_u32_u24 v1, v1, 5, 1
@@ -2913,30 +2912,31 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    v_mad_u16 v1, v1, 5, 1
 ; GFX8-NEXT:    v_mad_u16 v0, v0, 5, 1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_5_add_1_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, 5 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %arg, <i16 5, i16 5>
   %add = add <2 x i16> %mul, <i16 1, i16 1>
@@ -2946,7 +2946,6 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) {
 define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) {
 ; GFX67-LABEL: v_mul_284_add_82_v2i16:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-NEXT:    s_movk_i32 s4, 0x11c
@@ -2957,11 +2956,11 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) {
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xfffe, v0
 ; GFX67-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xfffe, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_284_add_82_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX8-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0x52
@@ -2969,22 +2968,23 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) {
 ; GFX8-NEXT:    v_mad_u16 v0, v0, s4, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_284_add_82_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0]
 ; GFX9-NEXT:    s_movk_i32 s4, 0x52
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_284_add_82_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, 0x11c, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, 0x52, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul <2 x i16> %arg, <i16 284, i16 284>
   %add = add <2 x i16> %mul, <i16 82, i16 82>
@@ -2994,55 +2994,55 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) {
 define i64 @v_mul_5_add_1_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_5_add_1_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 5
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, 5
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_5_add_1_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_lo_u32 v2, v1, 5
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_5_add_1_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v1, 5
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_5_add_1_i64:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, 5, v[1:2]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_5_add_1_i64:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, 5, 1
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, 5, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_5_add_1_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, 5, 1
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, v2, 5, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 5
   %add = add i64 %mul, 1
@@ -3052,7 +3052,6 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) {
 define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_284_add_82_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s4
@@ -3060,44 +3059,44 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0x52, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_284_add_82_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX7-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0x52
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_284_add_82_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x52
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_284_add_82_i64:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x52
 ; GFX900-NEXT:    s_movk_i32 s6, 0x11c
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_284_add_82_i64:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x52
 ; GFX90A-NEXT:    s_movk_i32 s6, 0x11c
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
@@ -3106,15 +3105,16 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, s6, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_284_add_82_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_movk_i32 s4, 0x11c
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, s4, 0x52
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, 0x11c, v2, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 284
   %add = add i64 %mul, 82
@@ -3124,7 +3124,6 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) {
 define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX6-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x37b4a145
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s4
@@ -3132,44 +3131,44 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0x7da667, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s4, 0x37b4a145
 ; GFX7-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, 0x7da667
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
 ; GFX7-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s4, 0x37b4a145
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7da667
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[1:2]
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7da667
 ; GFX900-NEXT:    s_mov_b32 s6, 0x37b4a145
 ; GFX900-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX900-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX900-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s6, v[3:4]
 ; GFX900-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v2, s6, v[1:2]
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0x7da667
 ; GFX90A-NEXT:    s_mov_b32 s6, 0x37b4a145
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, 0
@@ -3178,15 +3177,16 @@ define i64 @v_mul_934584645_add_8234599_i64(i64 %arg) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
 ; GFX90A-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, s6, v[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, v2
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_934584645_add_8234599_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s4, 0x37b4a145
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], null, v0, s4, 0x7da667
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], null, 0x37b4a145, v2, v[1:2]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %mul = mul i64 %arg, 934584645
   %add = add i64 %mul, 8234599
@@ -3467,29 +3467,29 @@ define amdgpu_ps i32 @s_mul_add_1_i32_commute(i32 inreg %x, i32 inreg %y) {
 define i8 @v_mul_add_1_i8(i8 %x, i8 %y) {
 ; GFX67-LABEL: v_mul_add_1_i8:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %x, %add
@@ -3499,29 +3499,29 @@ define i8 @v_mul_add_1_i8(i8 %x, i8 %y) {
 define i8 @v_mul_add_1_i8_commute(i8 %x, i8 %y) {
 ; GFX67-LABEL: v_mul_add_1_i8_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i8_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %add, %x
@@ -3531,28 +3531,28 @@ define i8 @v_mul_add_1_i8_commute(i8 %x, i8 %y) {
 define i8 @v_mul_add_1_i8_zext(i8 zeroext %x, i8 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i8_zext:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i8_zext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_zext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_zext:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %x, %add
@@ -3562,28 +3562,28 @@ define i8 @v_mul_add_1_i8_zext(i8 zeroext %x, i8 zeroext %y) {
 define i8 @v_mul_add_1_i8_zext_commute(i8 zeroext %x, i8 zeroext %y) {
 ; GFX67-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_i8_zext_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add i8 %y, 1
   %mul = mul i8 %add, %x
@@ -3593,7 +3593,6 @@ define i8 @v_mul_add_1_i8_zext_commute(i8 zeroext %x, i8 zeroext %y) {
 define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i8:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -3605,36 +3604,37 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v2, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i8> %y, <i8 1, i8 1>
   %mul = mul <2 x i8> %x, %add
@@ -3644,7 +3644,6 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) {
 define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX67:       ; %bb.0:
-; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
 ; GFX67-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX67-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -3656,36 +3655,37 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) {
 ; GFX67-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
 ; GFX67-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
+; GFX67-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX67-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
 ; GFX8-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v3, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v0, v0, v2, v0
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_mul_add_1_v2i8_commute:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mad_u16 v1, v1, v3, v1
 ; GFX10-NEXT:    v_mad_u16 v0, v0, v2, v0
 ; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %add = add <2 x i8> %y, <i8 1, i8 1>
   %mul = mul <2 x i8> %add, %x
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 3a879e818af797b..d7aa7fd739a4395 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -28,17 +28,9 @@
 name:            tbuffer_store1
 body:             |
   bb.0:
-    ; OPT-LABEL: name: tbuffer_store1
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: tbuffer_store1
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: tbuffer_store1
+    ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -47,17 +39,9 @@ body:             |
 name:            tbuffer_store2
 body:             |
   bb.0:
-    ; OPT-LABEL: name: tbuffer_store2
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: tbuffer_store2
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: tbuffer_store2
+    ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+    ; CHECK-NEXT: S_ENDPGM 0
     TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
     S_ENDPGM 0
 ...
@@ -67,8 +51,7 @@ name:            flat_store
 body:             |
   bb.0:
     ; CHECK-LABEL: name: flat_store
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK: FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
     ; CHECK-NEXT: S_ENDPGM 0
     FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
@@ -78,19 +61,10 @@ body:             |
 name:            global_store
 body:             |
   bb.0:
-    ; OPT-LABEL: name: global_store
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    ; OPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: global_store
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    ; NOOPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: global_store
+    ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; CHECK-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
     S_ENDPGM 0
@@ -100,17 +74,9 @@ body:             |
 name:            buffer_store_format
 body:             |
   bb.0:
-    ; OPT-LABEL: name: buffer_store_format
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: buffer_store_format
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: buffer_store_format
+    ; CHECK: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -122,7 +88,6 @@ body:             |
     ; CHECK-LABEL: name: ds_write_b32
     ; CHECK: renamable $vgpr0 = IMPLICIT_DEF
     ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF
-    ; CHECK-NEXT: S_WAITCNT 0
     ; CHECK-NEXT: DS_WRITE_B32_gfx9 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec
     ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0 = IMPLICIT_DEF
@@ -137,23 +102,12 @@ body:             |
   bb.0:
     liveins: $vgpr0, $sgpr0_sgpr1
 
-    ; OPT-LABEL: name: global_store_dword
-    ; OPT: liveins: $vgpr0, $sgpr0_sgpr1
-    ; OPT-NEXT: {{  $}}
-    ; OPT-NEXT: S_WAITCNT 0
-    ; OPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    ; OPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: global_store_dword
-    ; NOOPT: liveins: $vgpr0, $sgpr0_sgpr1
-    ; NOOPT-NEXT: {{  $}}
-    ; NOOPT-NEXT: S_WAITCNT 0
-    ; NOOPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    ; NOOPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: global_store_dword
+    ; CHECK: liveins: $vgpr0, $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
     S_ENDPGM 0
@@ -166,7 +120,6 @@ body:             |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_WAITCNT 0
   ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -206,45 +159,23 @@ body:             |
 ---
 name:            multiple_basic_blocks2
 body:             |
-  ; OPT-LABEL: name: multiple_basic_blocks2
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 0
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
-  ;
-  ; NOOPT-LABEL: name: multiple_basic_blocks2
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 0
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.1:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; CHECK-LABEL: name: multiple_basic_blocks2
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.2
 
@@ -268,65 +199,33 @@ body:             |
 ---
 name:            multiple_basic_blocks3
 body:             |
-  ; OPT-LABEL: name: multiple_basic_blocks3
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 0
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   successors: %bb.4(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_BRANCH %bb.4
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.3:
-  ; OPT-NEXT:   successors: %bb.4(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.4
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.4:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
-  ;
-  ; NOOPT-LABEL: name: multiple_basic_blocks3
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 0
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.1:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   successors: %bb.4(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_BRANCH %bb.4
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.3:
-  ; NOOPT-NEXT:   successors: %bb.4(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.4
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.4:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; CHECK-LABEL: name: multiple_basic_blocks3
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.4
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.2
 
@@ -362,7 +261,6 @@ body:             |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.1(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_WAITCNT 0
   ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
@@ -395,47 +293,24 @@ body:             |
 ---
 name:            recursive_loop_vmem
 body:             |
-  ; OPT-LABEL: name: recursive_loop_vmem
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.1(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 0
-  ; OPT-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.1
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 1015
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-  ; OPT-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
-  ;
-  ; NOOPT-LABEL: name: recursive_loop_vmem
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.1(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 0
-  ; NOOPT-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.1
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.1:
-  ; NOOPT-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 1015
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
-  ; NOOPT-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; CHECK-LABEL: name: recursive_loop_vmem
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_WAITCNT 1015
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
 
@@ -458,17 +333,9 @@ body:             |
 name:            image_store
 body:             |
   bb.0:
-    ; OPT-LABEL: name: image_store
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: image_store
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: image_store
+    ; CHECK: IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+    ; CHECK-NEXT: S_ENDPGM 0
   IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
   S_ENDPGM 0
 ...
@@ -478,8 +345,7 @@ name:            scratch_store
 body:             |
   bb.0:
     ; CHECK-LABEL: name: scratch_store
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+    ; CHECK: renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
     ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
     ; CHECK-NEXT: S_ENDPGM 0
     renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
@@ -491,17 +357,9 @@ body:             |
 name:            buffer_atomic
 body:             |
   bb.0:
-    ; OPT-LABEL: name: buffer_atomic
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: buffer_atomic
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: buffer_atomic
+    ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+    ; CHECK-NEXT: S_ENDPGM 0
     BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
     S_ENDPGM 0
 ...
@@ -511,8 +369,7 @@ name:            flat_atomic
 body:             |
   bb.0:
     ; CHECK-LABEL: name: flat_atomic
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+    ; CHECK: renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
     ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
     S_ENDPGM 0
@@ -524,8 +381,7 @@ name:            global_atomic
 body:             |
   bb.0:
     ; CHECK-LABEL: name: global_atomic
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+    ; CHECK: renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
     ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
     S_ENDPGM 0
@@ -536,8 +392,7 @@ name:            image_atomic
 body:             |
   bb.0:
     ; CHECK-LABEL: name: image_atomic
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+    ; CHECK: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
     ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
     S_ENDPGM 0
@@ -548,8 +403,7 @@ name:            global_store_optnone
 body:             |
   bb.0:
     ; CHECK-LABEL: name: global_store_optnone
-    ; CHECK: S_WAITCNT 0
-    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; CHECK-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index cf9de9d349e1c02..991b2658a3a6a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -6,7 +6,6 @@
 define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -29,11 +28,11 @@ define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 ; GFX6-NEXT:    v_fma_f32 v4, -v5, v6, v4
 ; GFX6-NEXT:    v_div_fmas_f32 v3, v4, v3, v6
 ; GFX6-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v0
 ; GFX9-NEXT:    v_div_scale_f32 v4, s[4:5], v2, v2, v1
 ; GFX9-NEXT:    v_div_scale_f32 v5, vcc, v0, v2, v0
@@ -57,11 +56,11 @@ define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 ; GFX9-NEXT:    v_div_fmas_f32 v4, v4, v8, v10
 ; GFX9-NEXT:    v_div_fixup_f32 v0, v3, v2, v0
 ; GFX9-NEXT:    v_div_fixup_f32 v1, v4, v2, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v2, v2, v0
 ; GFX11-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
 ; GFX11-NEXT:    v_div_scale_f32 v9, vcc_lo, v0, v2, v0
@@ -93,6 +92,7 @@ define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 ; GFX11-NEXT:    v_div_fixup_f32 v0, v3, v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_div_fixup_f32 v1, v4, v2, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv float %x, %D
   %div1 = fdiv float %y, %D
@@ -104,7 +104,6 @@ define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-NEXT:    v_fma_f32 v5, -v3, v4, 1.0
@@ -118,11 +117,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #
 ; GFX6-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x2_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
 ; GFX9-NEXT:    v_rcp_f32_e32 v5, v3
@@ -136,11 +135,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #
 ; GFX9-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x2_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v2, v2, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v6, vcc_lo, 1.0, v2, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -160,6 +159,7 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D
   %div1 = fdiv arcp float %y, %D
@@ -171,7 +171,6 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp(float %x, float %y, float %D) #
 define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %D) #1 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v4, v3
 ; GFX6-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
@@ -187,11 +186,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %
 ; GFX6-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v2, 1.0
 ; GFX9-NEXT:    v_rcp_f32_e32 v5, v3
@@ -207,11 +206,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %
 ; GFX9-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x2_arcp_daz:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v3, null, v2, v2, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v2, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -233,6 +232,7 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D
   %div1 = fdiv arcp float %y, %D
@@ -244,7 +244,6 @@ define <2 x float> @v_repeat_divisor_f32_x2_arcp_daz(float %x, float %y, float %
 define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f16_x2_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -264,26 +263,27 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 ; GFX6-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f16_x2_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f16_x2_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp half %x, %D
   %div1 = fdiv arcp half %y, %D
@@ -295,7 +295,6 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f64_x2_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0
 ; GFX6-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
@@ -313,11 +312,11 @@ define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %
 ; GFX6-NEXT:    v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0
 ; GFX6-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX6-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f64_x2_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0
 ; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
 ; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@@ -331,11 +330,11 @@ define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %
 ; GFX9-NEXT:    v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f64_x2_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], 1.0
 ; GFX11-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[4:5], 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -355,6 +354,7 @@ define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX11-NEXT:    v_mul_f64 v[2:3], v[2:3], v[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp double %x, %D
   %div1 = fdiv arcp double %y, %D
@@ -366,7 +366,6 @@ define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double %
 define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x3_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v5, v4
 ; GFX6-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
@@ -381,11 +380,11 @@ define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, f
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x3_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v4, s[4:5], v3, v3, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v3, 1.0
 ; GFX9-NEXT:    v_rcp_f32_e32 v6, v4
@@ -400,11 +399,11 @@ define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, f
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x3_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v4, null, v3, v3, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v3, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -425,6 +424,7 @@ define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, f
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D
   %div1 = fdiv arcp float %y, %D
@@ -438,7 +438,6 @@ define <3 x float> @v_repeat_divisor_f32_x3_arcp(float %x, float %y, float %z, f
 define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, float %w, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x4_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v5, s[4:5], v4, v4, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v6, v5
 ; GFX6-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
@@ -454,11 +453,11 @@ define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, f
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x4_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v5, s[4:5], v4, v4, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v4, 1.0
 ; GFX9-NEXT:    v_rcp_f32_e32 v7, v5
@@ -474,11 +473,11 @@ define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, f
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x4_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v5, null, v4, v4, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v4, 1.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -500,6 +499,7 @@ define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, f
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D
   %div1 = fdiv arcp float %y, %D
@@ -515,7 +515,6 @@ define <4 x float> @v_repeat_divisor_f32_x4_arcp(float %x, float %y, float %z, f
 define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f16_x3_arcp:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -538,21 +537,21 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f16_x3_arcp:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v3, v3
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; GFX9-NEXT:    v_mul_f16_e32 v4, v1, v3
 ; GFX9-NEXT:    v_mul_f16_e32 v1, v2, v3
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f16_x3_arcp:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f16_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v3
@@ -560,6 +559,7 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp half %x, %D
   %div1 = fdiv arcp half %y, %D
@@ -573,7 +573,6 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2 x float> %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_v2f32_x2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, 1.0
 ; GFX6-NEXT:    v_rcp_f32_e32 v7, v6
 ; GFX6-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
@@ -600,11 +599,11 @@ define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_v2f32_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_div_scale_f32 v6, s[4:5], v4, v4, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v7, s[4:5], v5, v5, 1.0
 ; GFX9-NEXT:    v_div_scale_f32 v8, vcc, 1.0, v4, 1.0
@@ -632,11 +631,11 @@ define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2
 ; GFX9-NEXT:    v_div_fixup_f32 v5, v7, v5, 1.0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_v2f32_x2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_div_scale_f32 v6, null, v4, v4, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v7, null, v5, v5, 1.0
 ; GFX11-NEXT:    v_div_scale_f32 v12, vcc_lo, 1.0, v4, 1.0
@@ -670,6 +669,7 @@ define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2
 ; GFX11-NEXT:    v_div_fixup_f32 v5, v7, v5, 1.0
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp <2 x float> %x, %D
   %div1 = fdiv arcp <2 x float> %y, %D
@@ -680,7 +680,6 @@ define <4 x float> @v_repeat_divisor_v2f32_x2(<2 x float> %x, <2 x float> %y, <2
 define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
 ; GFX6-NEXT:    v_frexp_mant_f32_e32 v3, v2
 ; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s4
@@ -691,11 +690,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x2_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_frexp_mant_f32_e32 v3, v2
 ; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX9-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -703,11 +702,11 @@ define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D)
 ; GFX9-NEXT:    v_ldexp_f32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x2_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v3, v2
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -718,6 +717,7 @@ define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D, !fpmath !0
   %div1 = fdiv arcp float %y, %D, !fpmath !0
@@ -729,27 +729,27 @@ define <2 x float> @v_repeat_divisor_f32_x2_ulp25(float %x, float %y, float %D)
 define <2 x float> @v_repeat_divisor_f32_x2_daz_ulp25(float %x, float %y, float %D) #1 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_f32_x2_daz_ulp25:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp float %x, %D, !fpmath !0
   %div1 = fdiv arcp float %y, %D, !fpmath !0
@@ -761,7 +761,6 @@ define <2 x float> @v_repeat_divisor_f32_x2_daz_ulp25(float %x, float %y, float
 define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x half> %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -800,21 +799,21 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
 ; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -823,6 +822,7 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX11-NEXT:    v_pack_b32_f16 v2, v2, v3
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX11-NEXT:    v_pk_mul_f16 v1, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp <2 x half> %x, %D
   %div1 = fdiv arcp <2 x half> %y, %D
@@ -833,7 +833,6 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x half> %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -891,11 +890,11 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX6-NEXT:    v_mul_f32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v4, v4, v7
 ; GFX6-NEXT:    v_mul_f32_e32 v5, v5, v8
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rcp_f16_e32 v4, v4
 ; GFX9-NEXT:    v_rcp_f16_e32 v5, v5
@@ -908,11 +907,11 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v4
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v4, 16
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX11-NEXT:    v_rcp_f16_e32 v4, v4
 ; GFX11-NEXT:    v_rcp_f16_e32 v5, v5
@@ -929,6 +928,7 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp <3 x half> %x, %D
   %div1 = fdiv arcp <3 x half> %y, %D
diff --git a/llvm/test/CodeGen/AMDGPU/replace-store-of-insert-load.ll b/llvm/test/CodeGen/AMDGPU/replace-store-of-insert-load.ll
index 35a602af68c0a8e..0091d6a149ab730 100644
--- a/llvm/test/CodeGen/AMDGPU/replace-store-of-insert-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/replace-store-of-insert-load.ll
@@ -8,9 +8,8 @@
 define void @testcase_0(ptr addrspace(1) %in, float %arg) {
 ; CHECK-LABEL: testcase_0:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %loaded = load <4 x float>, ptr addrspace(1) %in
   %modified = insertelement <4 x float> %loaded, float %arg, i64 3
@@ -21,9 +20,8 @@ define void @testcase_0(ptr addrspace(1) %in, float %arg) {
 define void @testcase_1(ptr addrspace(1) %in, float %arg) {
 ; CHECK-LABEL: testcase_1:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %loaded = load <6 x float>, ptr addrspace(1) %in
   %modified = insertelement <6 x float> %loaded, float %arg, i64 4
@@ -34,9 +32,8 @@ define void @testcase_1(ptr addrspace(1) %in, float %arg) {
 define void @testcase_2(ptr addrspace(1) %in, double %arg) {
 ; CHECK-LABEL: testcase_2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %loaded = load <4 x double>, ptr addrspace(1) %in
   %modified = insertelement <4 x double> %loaded, double %arg, i64 1
@@ -47,9 +44,8 @@ define void @testcase_2(ptr addrspace(1) %in, double %arg) {
 define void @testcase_3(ptr addrspace(1) %in, double %arg) {
 ; CHECK-LABEL: testcase_3:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:56
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %loaded = load <8 x double>, ptr addrspace(1) %in
   %modified = insertelement <8 x double> %loaded, double %arg, i64 7
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 95fd7b3183fc354..ec6bfbc2b328b8e 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -323,7 +323,6 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ;
 ; SI-LABEL: test_rotl_i16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -339,12 +338,11 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_rotl_i16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
@@ -359,12 +357,11 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_rotl_i16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v6, v[2:3], off offset:48
 ; GFX10-NEXT:    global_load_ushort v7, v[0:1], off offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -374,11 +371,11 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX10-NEXT:    v_lshrrev_b16 v0, v0, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_rotl_i16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
@@ -389,6 +386,7 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index b201a506227dba1..34e74d1b6af121c 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -280,7 +280,6 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ;
 ; SI-LABEL: test_rotr_i16:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s4, s6
@@ -296,12 +295,11 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_rotr_i16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
@@ -316,12 +314,11 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_rotr_i16:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v6, v[2:3], off offset:48
 ; GFX10-NEXT:    global_load_ushort v7, v[0:1], off offset:32
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
@@ -331,11 +328,11 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_rotr_i16:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
@@ -346,6 +343,7 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v1, v0
 ; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 35219ccb49fff06..b6533a53767e849 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -15,62 +15,62 @@
 define float @v_roundeven_f32(float %x) {
 ; GFX6-LABEL: v_roundeven_f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f32:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f32:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f32:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f32:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f32:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call float @llvm.roundeven.f32(float %x)
   ret float %roundeven
@@ -79,72 +79,72 @@ define float @v_roundeven_f32(float %x) {
 define <2 x float> @v_roundeven_v2f32(<2 x float> %x) {
 ; GFX6-LABEL: v_roundeven_v2f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_v2f32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f32:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v2f32:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v2f32:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v2f32:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_v2f32:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x)
   ret <2 x float> %roundeven
@@ -153,82 +153,82 @@ define <2 x float> @v_roundeven_v2f32(<2 x float> %x) {
 define <3 x float> @v_roundeven_v3f32(<3 x float> %x) {
 ; GFX6-LABEL: v_roundeven_v3f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v3f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v3f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX8-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v3f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX9-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_v3f32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v2, v2
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v3f32:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v2, v2
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v3f32:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v2, v2
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v3f32:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v2, v2
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v3f32:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v2, v2
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_v3f32:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v2, v2
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x)
   ret <3 x float> %roundeven
@@ -237,92 +237,92 @@ define <3 x float> @v_roundeven_v3f32(<3 x float> %x) {
 define <4 x float> @v_roundeven_v4f32(<4 x float> %x) {
 ; GFX6-LABEL: v_roundeven_v4f32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX6-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX6-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v4f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX7-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX7-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v4f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX8-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX8-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v4f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX9-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX9-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_v4f32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v2, v2
 ; GFX10PLUS-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v4f32:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v4f32:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v4f32:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v4f32:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_v4f32:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)
   ret <4 x float> %roundeven
@@ -331,70 +331,70 @@ define <4 x float> @v_roundeven_v4f32(<4 x float> %x) {
 define half @v_roundeven_f16(half %x) {
 ; GFX6-LABEL: v_roundeven_f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f16:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f16:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f16:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f16:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f16:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call half @llvm.roundeven.f16(half %x)
   ret half %roundeven
@@ -403,112 +403,112 @@ define half @v_roundeven_f16(half %x) {
 define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v2f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_roundeven_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX8-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX10:       ; %bb.0:
-; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v1, v1
 ; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
   ret <2 x half> %roundeven
@@ -517,7 +517,6 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v2f16_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -529,11 +528,11 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rndne_f32_e32 v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f16_fneg:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -545,48 +544,48 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-NEXT:    v_rndne_f32_e32 v1, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f16_fneg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_roundeven_v2f16_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -597,11 +596,11 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -612,39 +611,40 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX8-NEXT:    v_rndne_f16_e64 v0, -v0
 ; SDAG_GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e64 v0, -v0
 ; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX10:       ; %bb.0:
-; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX10-NEXT:    v_rndne_f16_e64 v0, -v0
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e64 v1, -v1
 ; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
@@ -654,7 +654,6 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX6-LABEL: v_roundeven_v4f16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -667,11 +666,11 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v4f16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
@@ -684,44 +683,44 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v4f16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f16_e32 v2, v0
 ; GFX8-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX8-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v4f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
 ; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v4f16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
 ; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
 ; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_roundeven_v4f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
@@ -730,11 +729,11 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX11-NEXT:    v_rndne_f16_e32 v3, v3
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -747,11 +746,11 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -764,44 +763,44 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v1, v1
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v2, v2
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e32 v3, v3
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX8-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX8-NEXT:    v_rndne_f16_e32 v1, v1
 ; SDAG_GFX8-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; SDAG_GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v1, v1
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; SDAG_GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX10:       ; %bb.0:
-; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v1, v1
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v1, v1
@@ -810,6 +809,7 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v3, v3
 ; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; SDAG_GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
+; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
   ret <4 x half> %roundeven
@@ -819,62 +819,62 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 define float @v_roundeven_f32_fabs(float %x) {
 ; GFX6-LABEL: v_roundeven_f32_fabs:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f32_fabs:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f32_fabs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f32_fabs:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f32_fabs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e64 v0, |v0|
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f32_fabs:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e64 v0, |v0|
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f32_fabs:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e64 v0, |v0|
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f32_fabs:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e64 v0, |v0|
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f32_fabs:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e64 v0, |v0|
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f32_fabs:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e64 v0, |v0|
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x)
   %roundeven = call float @llvm.roundeven.f32(float %fabs.x)
@@ -938,62 +938,62 @@ define amdgpu_ps float @s_roundeven_f32(float inreg %x) {
 define float @v_roundeven_f32_fneg(float %x) {
 ; GFX6-LABEL: v_roundeven_f32_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f32_fneg:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f32_fneg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f32_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f32_fneg:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f32_e64 v0, -v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f32_fneg:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_rndne_f32_e64 v0, -v0
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f32_fneg:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f32_e64 v0, -v0
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f32_fneg:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f32_e64 v0, -v0
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f32_fneg:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f32_e64 v0, -v0
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f32_fneg:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f32_e64 v0, -v0
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %roundeven = call float @llvm.roundeven.f32(float %neg.x)
@@ -1003,7 +1003,6 @@ define float @v_roundeven_f32_fneg(float %x) {
 define double @v_roundeven_f64(double %x) {
 ; GFX6-LABEL: v_roundeven_f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x80000000, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    v_or_b32_e32 v3, 0x43300000, v3
@@ -1014,35 +1013,35 @@ define double @v_roundeven_f64(double %x) {
 ; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f64:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_brev_b32 s4, -2
 ; SDAG_GFX6-NEXT:    v_mov_b32_e32 v2, 0x43300000
 ; SDAG_GFX6-NEXT:    v_bfi_b32 v3, s4, v2, v1
@@ -1054,30 +1053,31 @@ define double @v_roundeven_f64(double %x) {
 ; SDAG_GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f64:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f64:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f64:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f64:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call double @llvm.roundeven.f64(double %x)
   ret double %roundeven
@@ -1086,7 +1086,6 @@ define double @v_roundeven_f64(double %x) {
 define double @v_roundeven_f64_fneg(double %x) {
 ; GFX6-LABEL: v_roundeven_f64_fneg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x80000000, v6
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
@@ -1098,35 +1097,35 @@ define double @v_roundeven_f64_fneg(double %x) {
 ; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_f64_fneg:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_f64_fneg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_f64_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_f64_fneg:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f64_fneg:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
 ; SDAG_GFX6-NEXT:    s_brev_b32 s4, -2
 ; SDAG_GFX6-NEXT:    v_mov_b32_e32 v2, 0x43300000
@@ -1139,30 +1138,31 @@ define double @v_roundeven_f64_fneg(double %x) {
 ; SDAG_GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_f64_fneg:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_f64_fneg:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_f64_fneg:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_f64_fneg:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f64_e64 v[0:1], -v[0:1]
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %roundeven = call double @llvm.roundeven.f64(double %neg.x)
@@ -1172,7 +1172,6 @@ define double @v_roundeven_f64_fneg(double %x) {
 define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; GFX6-LABEL: v_roundeven_v2f64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX6-NEXT:    v_or_b32_e32 v5, 0x43300000, v5
@@ -1190,39 +1189,39 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_roundeven_v2f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX7-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_roundeven_v2f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX8-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_roundeven_v2f64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX9-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_roundeven_v2f64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; GFX10PLUS-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f64:
 ; SDAG_GFX6:       ; %bb.0:
-; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_brev_b32 s6, -2
 ; SDAG_GFX6-NEXT:    v_mov_b32_e32 v8, 0x43300000
 ; SDAG_GFX6-NEXT:    v_bfi_b32 v5, s6, v8, v1
@@ -1240,34 +1239,35 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
 ; SDAG_GFX6-NEXT:    v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SDAG_GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; SDAG_GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX7-LABEL: v_roundeven_v2f64:
 ; SDAG_GFX7:       ; %bb.0:
-; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; SDAG_GFX7-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; SDAG_GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX8-LABEL: v_roundeven_v2f64:
 ; SDAG_GFX8:       ; %bb.0:
-; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; SDAG_GFX8-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; SDAG_GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX9-LABEL: v_roundeven_v2f64:
 ; SDAG_GFX9:       ; %bb.0:
-; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; SDAG_GFX9-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10PLUS-LABEL: v_roundeven_v2f64:
 ; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f64_e32 v[0:1], v[0:1]
 ; SDAG_GFX10PLUS-NEXT:    v_rndne_f64_e32 v[2:3], v[2:3]
+; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x)
   ret <2 x double> %roundeven
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index 415ed89668abbd5..85fcdf883dcefdb 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -895,21 +895,20 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
 define float @v_neg_rsq_neg_f32(float %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -928,11 +927,11 @@ define float @v_neg_rsq_neg_f32(float %val) {
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -959,11 +958,11 @@ define float @v_neg_rsq_neg_f32(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -987,6 +986,7 @@ define float @v_neg_rsq_neg_f32(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg float %val
   %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
@@ -997,25 +997,24 @@ define float @v_neg_rsq_neg_f32(float %val) {
 define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s5
@@ -1051,11 +1050,11 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
@@ -1106,11 +1105,11 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
@@ -1156,6 +1155,7 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg <2 x float> %val
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
@@ -1166,21 +1166,20 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -1200,11 +1199,11 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -1232,11 +1231,11 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
@@ -1261,6 +1260,7 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %val0.neg = fneg float %val0
   %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
@@ -1272,25 +1272,24 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s5
@@ -1328,11 +1327,11 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
@@ -1385,11 +1384,11 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
@@ -1437,6 +1436,7 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %val0.fneg = fneg <2 x float> %val0
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
@@ -1448,21 +1448,20 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
 define float @v_neg_rsq_f32(float %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1481,11 +1480,11 @@ define float @v_neg_rsq_f32(float %val) {
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1512,11 +1511,11 @@ define float @v_neg_rsq_f32(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1540,6 +1539,7 @@ define float @v_neg_rsq_f32(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -1549,25 +1549,24 @@ define float @v_neg_rsq_f32(float %val) {
 define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
@@ -1602,11 +1601,11 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
@@ -1656,11 +1655,11 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
@@ -1705,6 +1704,7 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -1714,21 +1714,20 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
 define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1748,11 +1747,11 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 ; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1780,11 +1779,11 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -1809,6 +1808,7 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -1819,25 +1819,24 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
 ; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
 ; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
 ; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
@@ -1874,11 +1873,11 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
@@ -1930,11 +1929,11 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
@@ -1981,6 +1980,7 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -1991,19 +1991,18 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 define float @v_rsq_f32(float %val) {
 ; GCN-DAZ-LABEL: v_rsq_f32:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
 ; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4b800000
 ; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2013,6 +2012,7 @@ define float @v_rsq_f32(float %val) {
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2022,30 +2022,29 @@ define float @v_rsq_f32(float %val) {
 define { float, float } @v_rsq_f32_multi_use(float %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_sqrt_f32_e32 v2, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v0
 ; GCN-DAZ-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_sqrt_f32_e32 v2, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2072,11 +2071,11 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2100,6 +2099,7 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
@@ -2111,26 +2111,25 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 define float @v_rsq_f32_missing_contract0(float %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2157,11 +2156,11 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2185,6 +2184,7 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2194,26 +2194,25 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 define float @v_rsq_f32_missing_contract1(float %val) {
 ; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
 ; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
 ; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
 ; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2240,11 +2239,11 @@ define float @v_rsq_f32_missing_contract1(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
 ; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
 ; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2268,6 +2267,7 @@ define float @v_rsq_f32_missing_contract1(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv float 1.0, %sqrt, !fpmath !1
@@ -2279,21 +2279,20 @@ define float @v_rsq_f32_missing_contract1(float %val) {
 define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
 ; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2303,6 +2302,7 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
 ; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2314,21 +2314,20 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
 define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) {
 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
 ; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2338,6 +2337,7 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
 ; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2349,21 +2349,20 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %
 define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) {
 ; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
 ; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
 ; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2374,6 +2373,7 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %
 ; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
 ; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-IEEE-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2384,14 +2384,14 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %
 define float @v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) {
 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_denormal:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-LABEL: v_rsq_f32_known_never_denormal:
 ; GCN-IEEE:       ; %bb.0:
-; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
@@ -2401,19 +2401,18 @@ define float @v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) {
 define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 ; GCN-DAZ-LABEL: v_rsq_f32_known_never_posdenormal:
 ; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_known_never_posdenormal:
 ; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
 ; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4b800000
 ; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
@@ -2423,6 +2422,7 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 ; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x45800000
 ; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
 ; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
index 3dc565ceed0d0ba..cdb4752c00c3f2f 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll
@@ -737,7 +737,6 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) {
 define double @v_rsq_f64(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -776,11 +775,11 @@ define double @v_rsq_f64(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -819,11 +818,11 @@ define double @v_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -858,11 +857,11 @@ define double @v_rsq_f64(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -897,6 +896,7 @@ define double @v_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract double 1.0, %sqrt
@@ -906,7 +906,6 @@ define double @v_rsq_f64(double %x) {
 define double @v_rsq_f64_fabs(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_fabs:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -945,11 +944,11 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64_fabs:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
@@ -988,11 +987,11 @@ define double @v_rsq_f64_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -1027,11 +1026,11 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, v[2:3]
@@ -1066,6 +1065,7 @@ define double @v_rsq_f64_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call double @llvm.fabs.f64(double %x)
   %sqrt = call contract double @llvm.sqrt.f64(double %fabs.x)
@@ -1076,7 +1076,6 @@ define double @v_rsq_f64_fabs(double %x) {
 define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1115,11 +1114,11 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1158,11 +1157,11 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract0:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1197,11 +1196,11 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract0:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1236,6 +1235,7 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract double 1.0, %sqrt
@@ -1245,7 +1245,6 @@ define double @v_rsq_f64_missing_contract0(double %x) {
 define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1284,11 +1283,11 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1327,11 +1326,11 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64_missing_contract1:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1366,11 +1365,11 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64_missing_contract1:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1405,6 +1404,7 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv double 1.0, %sqrt
@@ -1414,7 +1414,6 @@ define double @v_rsq_f64_missing_contract1(double %x) {
 define double @v_neg_rsq_f64(double %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1453,11 +1452,11 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1496,11 +1495,11 @@ define double @v_neg_rsq_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1535,11 +1534,11 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -1574,6 +1573,7 @@ define double @v_neg_rsq_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract double -1.0, %sqrt
@@ -1583,7 +1583,6 @@ define double @v_neg_rsq_f64(double %x) {
 define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -1656,11 +1655,11 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-NEXT:    s_nop 0
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -1734,11 +1733,11 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_nop 0
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -1804,11 +1803,11 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -1875,6 +1874,7 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv <2 x double> <double 1.0, double 1.0>, %sqrt
@@ -1884,7 +1884,6 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) {
 define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -1957,11 +1956,11 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-NEXT:    s_nop 0
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2035,11 +2034,11 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_nop 0
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -2105,11 +2104,11 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -2176,6 +2175,7 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv <2 x double> <double -1.0, double -1.0>, %sqrt
@@ -2185,7 +2185,6 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) {
 define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2226,11 +2225,11 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2304,11 +2303,11 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_nop 0
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2345,11 +2344,11 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -2416,6 +2415,7 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv <2 x double> <double -1.0, double poison>, %sqrt
@@ -2425,7 +2425,6 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) {
 define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -2499,11 +2498,11 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-SDAG-NEXT:    s_nop 0
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2578,11 +2577,11 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; SI-GISEL-NEXT:    s_nop 0
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -2648,11 +2647,11 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[11:12], v[19:20]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -2719,6 +2718,7 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv <2 x double> <double -1.0, double 1.0>, %sqrt
@@ -2728,7 +2728,6 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) {
 define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 9
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -2767,11 +2766,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
@@ -2810,11 +2809,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 9
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
@@ -2849,11 +2848,11 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e64 vcc, -|v[0:1]|, v[2:3]
@@ -2888,6 +2887,7 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call double @llvm.fabs.f64(double %x)
   %fneg.fabs = fneg double %fabs
@@ -2899,7 +2899,6 @@ define double @v_rsq_f64_fneg_fabs(double %x) {
 define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -2938,11 +2937,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -2981,11 +2980,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3020,11 +3019,11 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3059,6 +3058,7 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract double 1.0, %sqrt
@@ -3068,7 +3068,6 @@ define double @v_rsq_f64__afn_sqrt(double %x) {
 define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3099,11 +3098,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3134,11 +3133,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn_fdiv:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3169,11 +3168,11 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn_fdiv:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3204,6 +3203,7 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
@@ -3213,7 +3213,6 @@ define double @v_rsq_f64__afn_fdiv(double %x) {
 define double @v_rsq_f64__afn(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3244,11 +3243,11 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3279,11 +3278,11 @@ define double @v_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3314,11 +3313,11 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3349,6 +3348,7 @@ define double @v_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double 1.0, %sqrt
@@ -3358,7 +3358,6 @@ define double @v_rsq_f64__afn(double %x) {
 define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_f64__afn:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3390,11 +3389,11 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_f64__afn:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3426,11 +3425,11 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_f64__afn:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3462,11 +3461,11 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_f64__afn:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3498,6 +3497,7 @@ define double @v_neg_rsq_f64__afn(double %x) {
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn double -1.0, %sqrt
@@ -3507,7 +3507,6 @@ define double @v_neg_rsq_f64__afn(double %x) {
 define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3538,11 +3537,11 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3573,11 +3572,11 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3608,11 +3607,11 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3643,6 +3642,7 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn ninf double 1.0, %sqrt
@@ -3652,7 +3652,6 @@ define double @v_rsq_f64__afn_ninf(double %x) {
 define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_nnan:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3683,11 +3682,11 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3718,11 +3717,11 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn_nnan:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3753,11 +3752,11 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3788,6 +3787,7 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan double 1.0, %sqrt
@@ -3797,7 +3797,6 @@ define double @v_rsq_f64__afn_nnan(double %x) {
 define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3828,11 +3827,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3863,11 +3862,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3898,11 +3897,11 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -3933,6 +3932,7 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double 1.0, %sqrt
@@ -3942,7 +3942,6 @@ define double @v_rsq_f64__afn_nnan_ninf(double %x) {
 define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -3974,11 +3973,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4010,11 +4009,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4046,11 +4045,11 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-SDAG-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4082,6 +4081,7 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_mul_f64 v[4:5], v[2:3], -1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt
@@ -4091,7 +4091,6 @@ define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) {
 define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4130,11 +4129,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4173,11 +4172,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4212,11 +4211,11 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4251,6 +4250,7 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract nnan ninf double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract nnan ninf double 1.0, %sqrt
@@ -4260,7 +4260,6 @@ define double @v_rsq_f64__nnan_ninf(double %x) {
 define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -4317,11 +4316,11 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; SI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; SI-GISEL-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4379,11 +4378,11 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -4440,11 +4439,11 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_mov_b32 s4, 0
 ; VI-GISEL-NEXT:    s_brev_b32 s5, 8
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s4
@@ -4502,6 +4501,7 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5]
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
   %rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt
@@ -4667,7 +4667,6 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 {
 define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-SDAG-LABEL: v_rsq_f64_unsafe:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4698,11 +4697,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_f64_unsafe:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4733,11 +4732,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; SI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_f64_unsafe:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -4768,11 +4767,11 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-SDAG-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_f64_unsafe:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -4803,6 +4802,7 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3]
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0
 ; VI-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.sqrt.f64(double %x)
   %rsq = fdiv double 1.0, %sqrt
@@ -4812,7 +4812,6 @@ define double @v_rsq_f64_unsafe(double %x) #0 {
 define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-SDAG-LABEL: v_rsq_amdgcn_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0x3ff00000
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4829,11 +4828,11 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_rsq_amdgcn_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0x3ff00000
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
@@ -4850,11 +4849,11 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_rsq_amdgcn_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -4867,11 +4866,11 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_rsq_amdgcn_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -4884,6 +4883,7 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x)
   %rsq = fdiv contract double 1.0, %sqrt
@@ -4893,7 +4893,6 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) {
 define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-SDAG-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-SDAG-NEXT:    s_mov_b32 s6, 0xbff00000
 ; SI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -4910,11 +4909,11 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
 ; SI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
@@ -4931,11 +4930,11 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-SDAG-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-SDAG-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -4948,11 +4947,11 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_neg_rsq_amdgcn_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_sqrt_f64_e32 v[0:1], v[0:1]
 ; VI-GISEL-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0
 ; VI-GISEL-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
@@ -4965,6 +4964,7 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x)
   %rsq = fdiv contract double -1.0, %sqrt
@@ -5066,7 +5066,6 @@ define amdgpu_ps <2 x i32> @s_rsq_amdgcn_sqrt_f64(double inreg %x) {
 define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5104,11 +5103,11 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5146,11 +5145,11 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_div_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5185,11 +5184,11 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_div_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5224,6 +5223,7 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %y)
   %rsq = fdiv contract double %x, %sqrt
@@ -5233,7 +5233,6 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) {
 define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5271,11 +5270,11 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5313,11 +5312,11 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_div_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5352,11 +5351,11 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_div_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5391,6 +5390,7 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call double @llvm.sqrt.f64(double %y)
   %rsq = fdiv arcp double %x, %sqrt
@@ -5400,7 +5400,6 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) {
 define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5438,11 +5437,11 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5480,11 +5479,11 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3]
@@ -5519,11 +5518,11 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v5, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[2:3], v[4:5]
@@ -5558,6 +5557,7 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %y)
   %rsq = fdiv contract arcp double %x, %sqrt
@@ -5567,7 +5567,6 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) {
 define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-SDAG:       ; %bb.0:
-; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; SI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; SI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -5608,11 +5607,11 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; SI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; SI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7]
+; SI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; SI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -5653,11 +5652,11 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; SI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
 ; SI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
 ; SI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7]
+; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-SDAG:       ; %bb.0:
-; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_mov_b32 s4, 0
 ; VI-SDAG-NEXT:    s_brev_b32 s5, 8
 ; VI-SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
@@ -5694,11 +5693,11 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-SDAG-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-SDAG-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-SDAG-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5]
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64:
 ; VI-GISEL:       ; %bb.0:
-; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-GISEL-NEXT:    v_bfrev_b32_e32 v3, 8
 ; VI-GISEL-NEXT:    v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
@@ -5735,6 +5734,7 @@ define double @v_div_const_contract_sqrt_f64(double %x) {
 ; VI-GISEL-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7]
 ; VI-GISEL-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9]
 ; VI-GISEL-NEXT:    v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5]
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract double @llvm.sqrt.f64(double %x)
   %rsq = fdiv contract double 256.0, %sqrt
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 9c5214338c54a79..699bcd5dbc67bf8 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -8,39 +8,39 @@
 define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_saddsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff80
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x7f
 ; GFX6-NEXT:    v_med3_i32 v0, v0, s4, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -49,18 +49,17 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
 define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_saddsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX6-NEXT:    v_med3_i32 v0, v0, s4, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
@@ -68,18 +67,19 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -88,7 +88,6 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
 define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_saddsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
@@ -96,11 +95,11 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
@@ -108,18 +107,19 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -128,7 +128,6 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
@@ -143,11 +142,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
@@ -166,18 +165,19 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -186,7 +186,6 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
@@ -206,11 +205,11 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
@@ -236,20 +235,21 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v3i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
   ret <3 x i16> %result
@@ -258,7 +258,6 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
@@ -283,11 +282,11 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
@@ -324,20 +323,21 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -347,7 +347,6 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
@@ -362,11 +361,11 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
@@ -381,20 +380,21 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_saddsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -403,7 +403,6 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_saddsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -413,11 +412,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -427,11 +426,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -441,11 +440,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
@@ -455,11 +454,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
@@ -468,6 +467,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
index 29c82db6f8204ed..ac2e04d6813e15f 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
@@ -4,13 +4,13 @@
 define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i32 %dst) {
 ; CHECK-LABEL: example:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_add_nc_u32_e32 v3, 4, v0
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    scratch_load_b32 v2, v0, off
 ; CHECK-NEXT:    scratch_load_b32 v3, v3, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_b64 v[2:3], v1, s[4:7], 0 offen
+; CHECK-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 
   %x0 = load i32, ptr addrspace(5) %src
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index adc4c85aa60dc07..b4ba5b24b505bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -237,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_sdiv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v4, vcc
@@ -357,11 +356,11 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v4, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 31, v3
@@ -394,6 +393,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v11, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -458,6 +458,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v2, v9, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, %y
   ret i64 %result
@@ -533,7 +534,6 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %
 define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_sdiv24_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
@@ -547,11 +547,11 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv24_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
@@ -565,6 +565,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
@@ -1277,7 +1278,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
@@ -1380,11 +1380,11 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
@@ -1406,6 +1406,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1466,6 +1467,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 24, %x
   ret i64 %result
@@ -1474,7 +1476,6 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
@@ -1573,11 +1574,11 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
@@ -1600,6 +1601,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1661,6 +1663,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v4, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 32768, %x
   ret i64 %result
@@ -1669,17 +1672,16 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 15
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v2, v1
@@ -1700,6 +1702,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v8, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v7, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB13_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1758,6 +1761,7 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v3, v6, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, 32768
   ret i64 %result
@@ -1884,7 +1888,6 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x
 define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv24_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -1900,11 +1903,11 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv24_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -1920,6 +1923,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 24, %x.shr
@@ -1929,7 +1933,6 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -1945,11 +1948,11 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -1965,6 +1968,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 32768, %x.shr
@@ -1974,17 +1978,16 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_sdiv24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 15
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -1999,6 +2002,7 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 %x.shr, 32768
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 11c71a71f266f58..3d3236808fb22fc 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2101,7 +2101,6 @@ bb11:                                             ; preds = %bb10, %bb2
 define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA-LABEL: crash_lshlrevb16_not_reg_op:
 ; NOSDWA:       ; %bb.0: ; %bb0
-; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 0
 ; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0xff
 ; NOSDWA-NEXT:    v_and_b32_e32 v0, s4, v0
@@ -2119,12 +2118,11 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA-NEXT:    s_mov_b64 vcc, vcc
 ; NOSDWA-NEXT:    s_cbranch_vccnz .LBB22_1
 ; NOSDWA-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; NOSDWA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX89:       ; %bb.0: ; %bb0
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX89-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s4
@@ -2141,12 +2139,11 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX89-NEXT:    s_mov_b64 vcc, vcc
 ; GFX89-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX89-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX9:       ; %bb.0: ; %bb0
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v0, 8, 1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2162,12 +2159,11 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX9-NEXT:    s_mov_b64 vcc, vcc
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX9-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: crash_lshlrevb16_not_reg_op:
 ; GFX10:       ; %bb.0: ; %bb0
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 8, 1
 ; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
 ; GFX10-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2182,7 +2178,7 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX10-NEXT:    flat_store_byte v[1:2], v3
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX10-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %1 = alloca [2 x i8], align 1, addrspace(5)
   %2 = getelementptr [2 x i8], ptr addrspace(5) %1, i32 0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
index bdd07ff5926aea1..a779345b643d68c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-xor.ll
@@ -4,9 +4,9 @@
 define i32 @xori64i32(i64 %a) {
 ; CHECK-LABEL: xori64i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shr4 = ashr i64 %a, 63
   %conv5 = trunc i64 %shr4 to i32
@@ -17,9 +17,9 @@ define i32 @xori64i32(i64 %a) {
 define i64 @selecti64i64(i64 %a) {
 ; CHECK-LABEL: selecti64i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x7fffffff, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i64 %a, -1
   %s = select i1 %c, i64 2147483647, i64 -2147483648
@@ -29,9 +29,9 @@ define i64 @selecti64i64(i64 %a) {
 define i32 @selecti64i32(i64 %a) {
 ; CHECK-LABEL: selecti64i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i64 %a, -1
   %s = select i1 %c, i32 2147483647, i32 -2147483648
@@ -41,10 +41,10 @@ define i32 @selecti64i32(i64 %a) {
 define i64 @selecti32i64(i32 %a) {
 ; CHECK-LABEL: selecti32i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x7fffffff, v1
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i32 %a, -1
   %s = select i1 %c, i64 2147483647, i64 -2147483648
@@ -56,9 +56,9 @@ define i64 @selecti32i64(i32 %a) {
 define i8 @xori32i8(i32 %a) {
 ; CHECK-LABEL: xori32i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x54, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shr4 = ashr i32 %a, 31
   %conv5 = trunc i32 %shr4 to i8
@@ -69,9 +69,9 @@ define i8 @xori32i8(i32 %a) {
 define i32 @selecti32i32(i32 %a) {
 ; CHECK-LABEL: selecti32i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x54, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i32 %a, -1
   %s = select i1 %c, i32 84, i32 -85
@@ -81,9 +81,9 @@ define i32 @selecti32i32(i32 %a) {
 define i8 @selecti32i8(i32 %a) {
 ; CHECK-LABEL: selecti32i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, 0x54, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i32 %a, -1
   %s = select i1 %c, i8 84, i8 -85
@@ -93,11 +93,11 @@ define i8 @selecti32i8(i32 %a) {
 define i32 @selecti8i32(i8 %a) {
 ; CHECK-LABEL: selecti8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0x54
 ; CHECK-NEXT:    v_ashrrev_i16 v0, 7, v0
 ; CHECK-NEXT:    v_xor_b32_sdwa v0, sext(v0), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sgt i8 %a, -1
   %s = select i1 %c, i32 84, i32 -85
@@ -107,9 +107,9 @@ define i32 @selecti8i32(i8 %a) {
 define i32 @icmpasreq(i32 %input, i32 %a, i32 %b) {
 ; CHECK-LABEL: icmpasreq:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sh = ashr i32 %input, 31
   %c = icmp eq i32 %sh, -1
@@ -120,9 +120,9 @@ define i32 @icmpasreq(i32 %input, i32 %a, i32 %b) {
 define i32 @icmpasrne(i32 %input, i32 %a, i32 %b) {
 ; CHECK-LABEL: icmpasrne:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_lt_i32_e32 vcc_lo, -1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sh = ashr i32 %input, 31
   %c = icmp ne i32 %sh, -1
@@ -133,11 +133,11 @@ define i32 @icmpasrne(i32 %input, i32 %a, i32 %b) {
 define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
 ; CHECK-LABEL: oneusecmp:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
 ; CHECK-NEXT:    v_xad_u32 v0, 0x7f, v3, v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %c = icmp sle i32 %a, -1
   %s = select i1 %c, i32 -128, i32 127
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index b2919cb4c7a22b2..4834f79fc240e76 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -10,7 +10,6 @@
 define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fabs_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -20,23 +19,24 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -49,7 +49,6 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, half %y, half %w, half %z) {
 ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -62,25 +61,26 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v4
 ; CI-NEXT:    v_add_f32_e64 v1, |v1|, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v4
 ; VI-NEXT:    v_add_f16_e64 v1, |v1|, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v1, |v1|, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -96,7 +96,6 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
 define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -107,26 +106,27 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half
 ; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v4
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v3
 ; VI-NEXT:    v_mov_b32_e32 v1, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -141,7 +141,6 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half
 define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, half %y, half %z, half %w) {
 ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -154,25 +153,26 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v3
 ; CI-NEXT:    v_add_f32_e64 v1, |v2|, v4
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v3
 ; VI-NEXT:    v_add_f16_e64 v1, |v2|, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v1, |v2|, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -188,7 +188,6 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
 define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fabs_var_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -198,25 +197,26 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_var_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_var_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -228,7 +228,6 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fabs_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -236,26 +235,27 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -268,32 +268,32 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
 define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; CI-LABEL: add_select_fabs_negk_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negk_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2.0, half -1.0
@@ -305,32 +305,32 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 define half @add_select_posk_posk_f16(i32 %c, half %x) {
 ; CI-LABEL: add_select_posk_posk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_posk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_posk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half 2.0, half 1.0
@@ -341,7 +341,6 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
 define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_negk_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -349,26 +348,27 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -380,7 +380,6 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
 define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_negliteralk_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, 0xc4800000
@@ -389,26 +388,27 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negliteralk_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xe400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negliteralk_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -420,7 +420,6 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
 define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fabs_posk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -428,24 +427,25 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_posk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_posk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -457,7 +457,6 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
 define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_posk_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -465,24 +464,25 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; CI-NEXT:    v_add_f32_e64 v0, |v0|, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -494,7 +494,6 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
 define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fneg_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -504,23 +503,24 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -533,7 +533,6 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
 define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, half %y, half %z, half %w) {
 ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -546,25 +545,26 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v4, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_sub_f16_e32 v1, v4, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v1, v4, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -580,7 +580,6 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
 define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -591,26 +590,27 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_xor_b32_e32 v4, 0x8000, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, v4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -625,7 +625,6 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half
 define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, half %y, half %z, half %w) {
 ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -638,25 +637,26 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v4, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_sub_f16_e32 v1, v4, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v1, v4, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -672,7 +672,6 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
 define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fneg_var_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -682,25 +681,26 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_var_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_var_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -712,7 +712,6 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fneg_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -720,24 +719,25 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -749,7 +749,6 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
 define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fneg_inv2pi_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, 0xbe230000
@@ -758,24 +757,25 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_inv2pi_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xb118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_inv2pi_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -787,7 +787,6 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
 define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fneg_neginv2pi_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_mov_b32_e32 v3, 0x3e230000
@@ -796,24 +795,25 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_neginv2pi_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_neginv2pi_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -825,32 +825,32 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
 define half @add_select_negk_negk_f16(i32 %c, half %x) {
 ; CI-LABEL: add_select_negk_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2.0, half -1.0
@@ -861,7 +861,6 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
 define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 ; CI-LABEL: add_select_negliteralk_negliteralk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_mov_b32_e32 v2, 0xc5800000
 ; CI-NEXT:    v_mov_b32_e32 v3, 0xc5000000
@@ -869,26 +868,27 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negliteralk_negliteralk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xec00
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xe800
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negliteralk_negliteralk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xe800
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2048.0, half -4096.0
@@ -899,32 +899,32 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
 ; CI-LABEL: add_select_fneg_negk_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_sub_f32_e32 v0, v1, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_negk_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_negk_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2.0, half -1.0
@@ -936,7 +936,6 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
 define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_negk_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -944,24 +943,25 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -973,7 +973,6 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
 define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_fneg_posk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -981,24 +980,25 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_posk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_posk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -1010,7 +1010,6 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
 define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: add_select_posk_fneg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -1018,24 +1017,25 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_fneg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_fneg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fneg half %x
@@ -1047,7 +1047,6 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
 define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_negfabs_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1057,27 +1056,28 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negfabs_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negfabs_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1091,7 +1091,6 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fabs_negfabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1101,27 +1100,28 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negfabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_or_b32_e32 v2, 0x8000, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negfabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1135,7 +1135,6 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_neg_fabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e64 v1, -v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1145,27 +1144,28 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_neg_fabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_neg_fabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub half -0.000000e+00, %x
@@ -1178,7 +1178,6 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_fabs_neg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e64 v2, -v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1188,27 +1187,28 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_neg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_neg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1221,7 +1221,6 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_neg_negfabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1231,25 +1230,26 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_neg_negfabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_neg_negfabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fneg.x = fsub half -0.000000e+00, %x
@@ -1263,7 +1263,6 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-LABEL: add_select_negfabs_neg_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
@@ -1273,25 +1272,26 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v3, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negfabs_neg_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negfabs_neg_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1305,7 +1305,6 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: mul_select_negfabs_posk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1313,26 +1312,27 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 4.0, v1, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negfabs_posk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negfabs_posk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1345,7 +1345,6 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
 define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: mul_select_posk_negfabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -1353,26 +1352,27 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 4.0, v1, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_posk_negfabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_posk_negfabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1385,7 +1385,6 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
 define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: mul_select_negfabs_negk_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1393,26 +1392,27 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -4.0, v1, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negfabs_negk_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc400
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negfabs_negk_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1425,7 +1425,6 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
 define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
 ; CI-LABEL: mul_select_negk_negfabs_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -1433,26 +1432,27 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -4.0, v1, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negk_negfabs_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negk_negfabs_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fabs.x = call half @llvm.fabs.f16(half %x)
@@ -1469,60 +1469,60 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
 define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_add_f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-SAFE-NEXT:    v_add_f32_e32 v1, 4.0, v1
 ; CI-SAFE-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_add_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f16_e32 v1, 4.0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_add_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NSZ-NEXT:    v_sub_f32_e32 v1, -4.0, v1
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_add_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v1, -4.0, v1
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v1, -4.0, v1
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %add = fadd half %x, 4.0
@@ -1534,60 +1534,60 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
 define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-SAFE-NEXT:    v_add_f32_e32 v1, -4.0, v1
 ; CI-SAFE-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_add_f16_e32 v1, -4.0, v1
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, -4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NSZ-NEXT:    v_sub_f32_e32 v1, 4.0, v1
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v1, 4.0, v1
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v1, 4.0, v1
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %add = fsub half %x, 4.0
@@ -1599,30 +1599,30 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
 define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) {
 ; CI-LABEL: select_fneg_posk_src_mul_f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_mul_f32_e32 v1, -4.0, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: select_fneg_posk_src_mul_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v1, -4.0, v1
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_posk_src_mul_f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %mul = fmul half %x, 4.0
@@ -1634,7 +1634,6 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) {
 define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_fma_f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1642,31 +1641,31 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 ; CI-SAFE-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-SAFE-NEXT:    v_fma_f32 v1, v1, 4.0, v2
 ; CI-SAFE-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_fma_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, v1, 4.0, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v2
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1674,24 +1673,25 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 ; CI-NSZ-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NSZ-NEXT:    v_fma_f32 v1, v1, -4.0, -v2
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z)
@@ -1703,7 +1703,6 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1712,31 +1711,31 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
 ; CI-SAFE-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; CI-SAFE-NEXT:    v_add_f32_e32 v1, v1, v2
 ; CI-SAFE-NEXT:    v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_fma_f16 v1, v1, 4.0, v2
 ; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v2
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1745,24 +1744,25 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
 ; CI-NSZ-NEXT:    v_mul_f32_e32 v1, -4.0, v1
 ; CI-NSZ-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll
index f4ad83bc321d105..6fd30d920c073e0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.legal.f16.ll
@@ -5,11 +5,11 @@
 define half @select_fneg_posk_src_rcp_f16(i32 %c, half %x, half %y) {
 ; VI-LABEL: select_fneg_posk_src_rcp_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_rcp_f16_e64 v1, -v1
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %rcp = call half @llvm.amdgcn.rcp.f16(half %x)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index d9dfa8a27e2dfcb..c003d0ed0d7d596 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -12,7 +12,6 @@
 define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fabs_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -31,11 +30,11 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v7
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -47,11 +46,11 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -63,11 +62,11 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -81,6 +80,7 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -93,7 +93,6 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %w, <2 x half> %z) {
 ; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -118,11 +117,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v9
 ; CI-NEXT:    v_add_f32_e32 v2, v2, v6
 ; CI-NEXT:    v_add_f32_e32 v3, v3, v7
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -137,11 +136,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v2, v2, v4
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -154,11 +153,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v5
 ; GFX9-NEXT:    v_pk_add_f16 v1, v2, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -173,6 +172,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; GFX11-NEXT:    v_pk_add_f16 v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -188,7 +188,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -207,11 +206,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v7
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -224,11 +223,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -241,11 +240,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -260,6 +259,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -274,7 +274,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z, <2 x half> %w) {
 ; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -299,11 +298,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v7
 ; CI-NEXT:    v_add_f32_e32 v2, v4, v8
 ; CI-NEXT:    v_add_f32_e32 v3, v5, v9
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -318,11 +317,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v2, v3, v5
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -335,11 +334,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-NEXT:    v_pk_add_f16 v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -354,6 +353,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; GFX11-NEXT:    v_pk_add_f16 v1, v3, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -369,7 +369,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fabs_var_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -388,11 +387,11 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v7
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_var_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -403,11 +402,11 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_var_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -418,11 +417,11 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_var_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -434,6 +433,7 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -445,7 +445,6 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fabs_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -460,11 +459,11 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -475,11 +474,11 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -490,11 +489,11 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -505,6 +504,7 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -517,7 +517,6 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: add_select_fabs_negk_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -532,11 +531,11 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -546,11 +545,11 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; VI-NEXT:    v_add_f16_sdwa v1, |v1|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e64 v0, |v0|, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -560,11 +559,11 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negk_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -575,6 +574,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>
@@ -586,7 +586,6 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: add_select_posk_posk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -597,11 +596,11 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 2.0, vcc
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_posk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -611,11 +610,11 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_posk_posk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -624,11 +623,11 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_posk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -638,6 +637,7 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 1.0, half 1.0>
@@ -648,7 +648,6 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_negk_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -663,11 +662,11 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -678,11 +677,11 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -693,11 +692,11 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -708,6 +707,7 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -719,7 +719,6 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_negliteralk_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -735,11 +734,11 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negliteralk_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xe400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -750,11 +749,11 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -765,11 +764,11 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -780,6 +779,7 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -791,7 +791,6 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fabs_posk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -806,11 +805,11 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_posk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -821,11 +820,11 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_posk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -836,11 +835,11 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_posk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -851,6 +850,7 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -862,7 +862,6 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_posk_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -877,11 +876,11 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -892,11 +891,11 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_posk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -907,11 +906,11 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -922,6 +921,7 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -933,7 +933,6 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fneg_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -952,11 +951,11 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v6, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -966,11 +965,11 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -980,11 +979,11 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -995,6 +994,7 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1007,7 +1007,6 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z, <2 x half> %w) {
 ; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1032,11 +1031,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
 ; CI-NEXT:    v_sub_f32_e32 v2, v8, v2
 ; CI-NEXT:    v_sub_f32_e32 v3, v9, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1049,11 +1048,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1064,11 +1063,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
@@ -1080,6 +1079,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; GFX11-NEXT:    v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1095,7 +1095,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1116,11 +1115,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v9, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v6, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1132,11 +1131,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v1, v5
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1148,11 +1147,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
@@ -1164,6 +1163,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1178,7 +1178,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z, <2 x half> %w) {
 ; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -1203,11 +1202,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
 ; CI-NEXT:    v_sub_f32_e32 v2, v8, v4
 ; CI-NEXT:    v_sub_f32_e32 v3, v9, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1220,11 +1219,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v2, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1235,11 +1234,11 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
@@ -1251,6 +1250,7 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; GFX11-NEXT:    v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1266,7 +1266,6 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fneg_var_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -1289,11 +1288,11 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_var_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -1304,11 +1303,11 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_var_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -1319,11 +1318,11 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_var_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -1335,6 +1334,7 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1346,7 +1346,6 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fneg_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1361,11 +1360,11 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1375,11 +1374,11 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1389,11 +1388,11 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1403,6 +1402,7 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1414,7 +1414,6 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fneg_inv2pi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1430,11 +1429,11 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_inv2pi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xb118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1444,11 +1443,11 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_inv2pi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xb118
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1458,11 +1457,11 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_inv2pi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1472,6 +1471,7 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1483,7 +1483,6 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1499,11 +1498,11 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x3118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1513,11 +1512,11 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3118
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1527,11 +1526,11 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1541,6 +1540,7 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1552,7 +1552,6 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: add_select_negk_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1563,11 +1562,11 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1577,11 +1576,11 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1590,11 +1589,11 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1604,6 +1603,7 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>
@@ -1614,7 +1614,6 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_mov_b32_e32 v4, 0xc5800000
@@ -1627,11 +1626,11 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v2
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xec00
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xe800
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1641,11 +1640,11 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v2
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xec00
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe800
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1654,11 +1653,11 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xe800
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1668,6 +1667,7 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2048.0, half -2048.0>, <2 x half> <half -4096.0, half -4096.0>
@@ -1678,7 +1678,6 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: add_select_fneg_negk_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1689,11 +1688,11 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_sub_f32_e32 v1, v3, v1
 ; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_negk_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1703,11 +1702,11 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; VI-NEXT:    v_sub_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1716,11 +1715,11 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_negk_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1730,6 +1729,7 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>
@@ -1741,7 +1741,6 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_negk_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1756,11 +1755,11 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negk_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
@@ -1770,11 +1769,11 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negk_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
@@ -1784,11 +1783,11 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negk_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1798,6 +1797,7 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1809,7 +1809,6 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_fneg_posk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1824,11 +1823,11 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fneg_posk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1838,11 +1837,11 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fneg_posk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
@@ -1852,11 +1851,11 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fneg_posk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1866,6 +1865,7 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1877,7 +1877,6 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: add_select_posk_fneg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -1892,11 +1891,11 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -1.0, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v4, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v5, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_posk_fneg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
@@ -1906,11 +1905,11 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_posk_fneg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
@@ -1920,11 +1919,11 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_posk_fneg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1934,6 +1933,7 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -1945,7 +1945,6 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_negfabs_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -1968,11 +1967,11 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negfabs_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -1984,11 +1983,11 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negfabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2000,11 +1999,11 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negfabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -2018,6 +2017,7 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2031,7 +2031,6 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fabs_negfabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -2054,11 +2053,11 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negfabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2070,11 +2069,11 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2086,11 +2085,11 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_negfabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -2104,6 +2103,7 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2117,7 +2117,6 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_neg_fabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -2140,11 +2139,11 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_neg_fabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2156,11 +2155,11 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_neg_fabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2172,11 +2171,11 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_neg_fabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -2190,6 +2189,7 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -2202,7 +2202,6 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_fabs_neg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
@@ -2225,11 +2224,11 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_add_f32_e32 v0, v0, v6
 ; CI-NEXT:    v_add_f32_e32 v1, v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_neg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2241,11 +2240,11 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_fabs_neg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2257,11 +2256,11 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_fabs_neg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -2275,6 +2274,7 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2287,7 +2287,6 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_neg_negfabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -2306,11 +2305,11 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v6, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_neg_negfabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
@@ -2321,11 +2320,11 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_neg_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
@@ -2336,11 +2335,11 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_neg_negfabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
@@ -2352,6 +2351,7 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fneg.x = fneg <2 x half> %x
@@ -2365,7 +2365,6 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
 ; CI-LABEL: add_select_negfabs_neg_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -2384,11 +2383,11 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
 ; CI-NEXT:    v_sub_f32_e32 v0, v6, v0
 ; CI-NEXT:    v_sub_f32_e32 v1, v7, v1
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_negfabs_neg_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -2399,11 +2398,11 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_negfabs_neg_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -2414,11 +2413,11 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: add_select_negfabs_neg_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
@@ -2430,6 +2429,7 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2443,7 +2443,6 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: mul_select_negfabs_posk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -2462,11 +2461,11 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 4.0, v2, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negfabs_posk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2477,11 +2476,11 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_select_negfabs_posk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2492,11 +2491,11 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negfabs_posk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2507,6 +2506,7 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2519,7 +2519,6 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: mul_select_posk_negfabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -2538,11 +2537,11 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 4.0, v2, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_posk_negfabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -2553,11 +2552,11 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -2568,11 +2567,11 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2583,6 +2582,7 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2595,7 +2595,6 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: mul_select_negfabs_negk_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -2614,11 +2613,11 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -4.0, v2, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negfabs_negk_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2629,11 +2628,11 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_select_negfabs_negk_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2644,11 +2643,11 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negfabs_negk_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2659,6 +2658,7 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2671,7 +2671,6 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-LABEL: mul_select_negk_negfabs_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
@@ -2690,11 +2689,11 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; CI-NEXT:    v_cndmask_b32_e32 v0, -4.0, v2, vcc
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: mul_select_negk_negfabs_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -2705,11 +2704,11 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
@@ -2720,11 +2719,11 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2735,6 +2734,7 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
@@ -2751,7 +2751,6 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2770,11 +2769,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v0, 2.0, v3, vcc
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; VI-SAFE-NEXT:    v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_add_f16_e32 v2, 4.0, v2
@@ -2788,11 +2787,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
@@ -2803,11 +2802,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2818,11 +2817,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2833,11 +2832,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v2, vcc
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0xc400
 ; VI-NSZ-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v2, -4.0, v2
@@ -2848,11 +2847,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
-; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2862,11 +2861,11 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -2876,6 +2875,7 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %add = fadd <2 x half> %x, <half 4.0, half 4.0>
@@ -2887,7 +2887,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2906,11 +2905,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v0, 2.0, v3, vcc
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0xc400
 ; VI-SAFE-NEXT:    v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_add_f16_e32 v2, -4.0, v2
@@ -2924,11 +2923,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
 ; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
@@ -2939,11 +2938,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2954,11 +2953,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2969,11 +2968,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v2, vcc
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; VI-NSZ-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NSZ-NEXT:    v_sub_f16_e32 v2, 4.0, v2
@@ -2984,11 +2983,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
-; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -2998,11 +2997,11 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -3012,6 +3011,7 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %add = fsub <2 x half> %x, <half 4.0, half 4.0>
@@ -3023,7 +3023,6 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-LABEL: select_fneg_posk_src_mul_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -3034,11 +3033,11 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 2.0, v2, vcc
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: select_fneg_posk_src_mul_v2f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc400
 ; VI-NEXT:    v_mul_f16_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e32 v2, -4.0, v2
@@ -3049,11 +3048,11 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_posk_src_mul_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -3063,11 +3062,11 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_posk_src_mul_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -3077,6 +3076,7 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %mul = fmul <2 x half> %x, <half 4.0, half 4.0>
@@ -3088,7 +3088,6 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) {
 ; CI-LABEL: select_fneg_posk_src_fma_v2f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -3111,11 +3110,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; CI-NEXT:    v_cndmask_b32_e32 v0, 2.0, v3, vcc
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-NEXT:    v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_fma_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-SAFE-NEXT:    v_fma_f16 v4, v5, 4.0, v4
@@ -3131,11 +3130,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
 ; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
@@ -3146,11 +3145,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3161,11 +3160,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NSZ-NEXT:    v_fma_f16 v2, v2, -4.0, -v3
@@ -3177,11 +3176,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
-; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -3191,11 +3190,11 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -3205,6 +3204,7 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
@@ -3216,7 +3216,6 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; CI-SAFE:       ; %bb.0:
-; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; CI-SAFE-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3241,11 +3240,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v0, 2.0, v3, vcc
 ; CI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-SAFE-NEXT:    v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; VI-SAFE:       ; %bb.0:
-; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-SAFE-NEXT:    v_fma_f16 v4, v5, 4.0, v4
@@ -3261,11 +3260,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
-; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
 ; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
@@ -3276,11 +3275,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX11-SAFE:       ; %bb.0:
-; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3291,11 +3290,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; CI-NSZ:       ; %bb.0:
-; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NSZ-NEXT:    v_cvt_f16_f32_e32 v4, v4
@@ -3312,11 +3311,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v0, 2.0, v2, vcc
 ; CI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CI-NSZ-NEXT:    v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; VI-NSZ:       ; %bb.0:
-; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NSZ-NEXT:    v_fma_f16 v2, v2, -4.0, -v3
@@ -3328,11 +3327,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
-; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -3342,11 +3341,11 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX11-NSZ:       ; %bb.0:
-; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
@@ -3356,6 +3355,7 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
index c9cc773a5629940..f9056258916a9bf 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-multiple-use.ll
@@ -10,7 +10,6 @@
 define i32 @f() {
 ; CHECK-LABEL: f:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ds_read_b32 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -18,6 +17,7 @@ define i32 @f() {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc_lo
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = load i32, ptr addrspace(3) null, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 16550fc9588aef0..929032f5495f85c 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -4,9 +4,9 @@
 define void @child_function() #0 {
 ; GCN-LABEL: child_function:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "", "~{vcc}" () #0
   ret void
@@ -15,9 +15,9 @@ define void @child_function() #0 {
 define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s18, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
@@ -309,9 +309,9 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
 define void @spill_to_lowest_available_vgpr() #0 {
 ; GCN-LABEL: spill_to_lowest_available_vgpr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s18, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[16:17], -1
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[16:17]
@@ -601,7 +601,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
 define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-LABEL: spill_sgpr_with_sgpr_uses:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -860,7 +860,7 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, align 4, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
@@ -908,7 +908,6 @@ ret:
 define void @spill_sgpr_with_tail_call() #0 {
 ; GCN-LABEL: spill_sgpr_with_tail_call:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
@@ -1181,7 +1180,7 @@ define void @spill_sgpr_with_tail_call() #0 {
 define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GCN-LABEL: spill_sgpr_no_free_vgpr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1449,7 +1448,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a = load <4 x i32>, ptr addrspace(1) %in
   call void asm sideeffect "",
@@ -1490,9 +1489,9 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in
 define internal void @child_function_ipra() #0 {
 ; GCN-LABEL: child_function_ipra:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -1527,7 +1526,6 @@ define internal void @child_function_ipra() #0 {
 define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s18, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
 ; GCN-NEXT:    s_add_i32 s32, s32, 0x7400
@@ -1644,6 +1642,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 ; GCN-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 s[16:17], exec
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_mov_b64 exec, 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456
 ; GCN-NEXT:    v_writelane_b32 v0, s30, 0
@@ -1808,7 +1807,6 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
 define internal void @child_function_ipra_tail_call() #0 {
 ; GCN-LABEL: child_function_ipra_tail_call:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
@@ -2035,7 +2033,7 @@ define internal void @child_function_ipra_tail_call() #0 {
 ; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "",
   "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
@@ -2070,7 +2068,6 @@ define internal void @child_function_ipra_tail_call() #0 {
 define void @spill_sgpr_no_free_vgpr_ipra_tail_call() #0 {
 ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra_tail_call:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_getpc_b64 s[16:17]
 ; GCN-NEXT:    s_add_u32 s16, s16, child_function_ipra_tail_call at rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, child_function_ipra_tail_call at rel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 4b02d00ddce1ef7..dbceffc3c82a466 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -4,7 +4,6 @@
 define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: v_shl_i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
 ; GCN-NEXT:    v_lshl_b64 v[5:6], v[2:3], v4
 ; GCN-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
@@ -21,6 +20,7 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, %rhs
   ret i128 %shl
@@ -29,7 +29,6 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: v_lshr_i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
 ; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
 ; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
@@ -46,6 +45,7 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 
   %shl = lshr i128 %lhs, %rhs
@@ -55,7 +55,6 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
 define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: v_ashr_i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
 ; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
 ; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
@@ -73,6 +72,7 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, %rhs
   ret i128 %shl
@@ -82,12 +82,12 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
 define i128 @v_shl_i128_vk(i128 %lhs) {
 ; GCN-LABEL: v_shl_i128_vk:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], 17
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
 ; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v0, 15
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, 17
   ret i128 %shl
@@ -96,11 +96,11 @@ define i128 @v_shl_i128_vk(i128 %lhs) {
 define i128 @v_lshr_i128_vk(i128 %lhs) {
 ; GCN-LABEL: v_lshr_i128_vk:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v3, v2, 1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr i128 %lhs, 65
   ret i128 %shl
@@ -109,12 +109,12 @@ define i128 @v_lshr_i128_vk(i128 %lhs) {
 define i128 @v_ashr_i128_vk(i128 %lhs) {
 ; GCN-LABEL: v_ashr_i128_vk:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, v1
 ; GCN-NEXT:    v_lshl_b64 v[0:1], v[2:3], 31
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 33
 ; GCN-NEXT:    v_or_b32_e32 v0, v4, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, 33
   ret i128 %shl
@@ -123,7 +123,6 @@ define i128 @v_ashr_i128_vk(i128 %lhs) {
 define i128 @v_shl_i128_kv(i128 %rhs) {
 ; GCN-LABEL: v_shl_i128_kv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 64, v0
 ; GCN-NEXT:    v_lshr_b64 v[2:3], 17, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, 64, v0
@@ -137,6 +136,7 @@ define i128 @v_shl_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 17, %rhs
   ret i128 %shl
@@ -145,7 +145,6 @@ define i128 @v_shl_i128_kv(i128 %rhs) {
 define i128 @v_lshr_i128_kv(i128 %rhs) {
 ; GCN-LABEL: v_lshr_i128_kv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0x41
 ; GCN-NEXT:    v_lshr_b64 v[1:2], s[4:5], v0
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
@@ -156,6 +155,7 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr i128 65, %rhs
   ret i128 %shl
@@ -164,7 +164,6 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
 define i128 @v_ashr_i128_kv(i128 %rhs) {
 ; GCN-LABEL: v_ashr_i128_kv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshr_b64 v[1:2], 33, v0
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -173,6 +172,7 @@ define i128 @v_ashr_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 33, %rhs
   ret i128 %shl
@@ -281,7 +281,6 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: v_shl_v2i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
@@ -322,6 +321,7 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
@@ -330,7 +330,6 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: v_lshr_v2i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
@@ -371,6 +370,7 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
@@ -379,7 +379,6 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: v_ashr_v2i128_vv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
 ; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
 ; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
@@ -422,6 +421,7 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 9cd13b3d4515099..dfb1523265db3e1 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -2122,7 +2122,6 @@ define amdgpu_kernel void @test_mul2(i32 %p) {
 define void @shl_or_k(ptr addrspace(1) %out, i32 %in) {
 ; SI-LABEL: shl_or_k:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2130,16 +2129,15 @@ define void @shl_or_k(ptr addrspace(1) %out, i32 %in) {
 ; SI-NEXT:    s_mov_b32 s5, s6
 ; SI-NEXT:    v_or_b32_e32 v2, 4, v2
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: shl_or_k:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_or_b32_e32 v2, 4, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: shl_or_k:
@@ -2163,7 +2161,6 @@ define void @shl_or_k(ptr addrspace(1) %out, i32 %in) {
 define void @shl_or_k_two_uses(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %in) {
 ; SI-LABEL: shl_or_k_two_uses:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s6, 0
 ; SI-NEXT:    v_or_b32_e32 v4, 1, v4
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
@@ -2172,17 +2169,16 @@ define void @shl_or_k_two_uses(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
 ; SI-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
 ; SI-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: shl_or_k_two_uses:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v4, 1, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
 ; VI-NEXT:    flat_store_dword v[0:1], v5
 ; VI-NEXT:    flat_store_dword v[2:3], v4
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: shl_or_k_two_uses:
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
index ec73b48c02ef4f5..bd74b63284a0fac 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_global.ll
@@ -4,7 +4,6 @@
 define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: shl_base_atomicrmw_global_ptr:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v6, 3
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14,7 +13,7 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(
 ; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v4
 ; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
 ; GCN-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
   %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
@@ -28,14 +27,13 @@ define void @shl_base_atomicrmw_global_ptr(ptr addrspace(1) %out, ptr addrspace(
 define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) #0 {
 ; GCN-LABEL: shl_base_global_ptr_global_atomic_fadd:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshlrev_b64 v[0:1], 2, v[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v6, 0x42c80000
 ; GCN-NEXT:    global_atomic_add_f32 v[0:1], v6, off offset:512
 ; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, 0x80, v4
 ; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
 ; GCN-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
   %cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index d34769ad0fcf0a6..12a8fc0d1df9c85 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -9,13 +9,13 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    .cfi_sections .debug_frame
 ; GCN-NEXT:    .cfi_startproc
 ; GCN-NEXT:  ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
 ; GCN-NEXT:    s_cbranch_execnz .LBB0_3
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index d1f05358ff13af6..f86d05531a63725 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -211,17 +211,17 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 define double @v_sint_to_fp_i8_to_f64(i8 %in) {
 ; CI-LABEL: v_sint_to_fp_i8_to_f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_sint_to_fp_i8_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fp = sitofp i8 %in to double
   ret double %fp
@@ -264,13 +264,12 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_select_sint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double -1.0, double 0.0
@@ -315,13 +314,12 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_select_sint_to_fp_i1_vals_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
@@ -333,13 +331,12 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
 define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_swap_select_sint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0xbff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double -1.0
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 077377ae2b2dc51..d74a9b33d94f19a 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1918,38 +1918,41 @@ latch:
 define void @skip_mode_switch(i32 %arg) {
 ; WAVE64-LABEL: skip_mode_switch:
 ; WAVE64:       ; %bb.0: ; %entry
-; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; WAVE64-NEXT:    s_waitcnt expcnt(0)
 ; WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; WAVE64-NEXT:    s_cbranch_execz .LBB16_2
 ; WAVE64-NEXT:  ; %bb.1: ; %bb.0
 ; WAVE64-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
 ; WAVE64-NEXT:  .LBB16_2: ; %bb.1
 ; WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE64-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-WAVE32-LABEL: skip_mode_switch:
 ; GFX10-WAVE32:       ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-WAVE32-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %bb.0
 ; GFX10-WAVE32-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
 ; GFX10-WAVE32-NEXT:  .LBB16_2: ; %bb.1
 ; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-WAVE32-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: skip_mode_switch:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX11-NEXT:  ; %bb.1: ; %bb.0
 ; GFX11-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3
 ; GFX11-NEXT:  .LBB16_2: ; %bb.1
 ; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %cmp = icmp eq i32 %arg, 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
index 7ad3520376acfc4..55b958a327d9710 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll
@@ -112,7 +112,6 @@ entry:
 define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
 ; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
@@ -130,11 +129,11 @@ define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
@@ -152,6 +151,7 @@ define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -410,7 +410,6 @@ entry:
 define void @test_inst_offset_function() {
 ; MUBUF-LABEL: test_inst_offset_function:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
@@ -419,12 +418,11 @@ define void @test_inst_offset_function() {
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: test_inst_offset_function:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
@@ -433,7 +431,7 @@ define void @test_inst_offset_function() {
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; Occupy enough bytes of scratch, so the offset of the spill of %a
@@ -459,7 +457,6 @@ entry:
 define void @test_sgpr_offset_function() {
 ; MUBUF-LABEL: test_sgpr_offset_function:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_add_i32 s4, s32, 0x40100
@@ -470,12 +467,11 @@ define void @test_sgpr_offset_function() {
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
-; MUBUF-NEXT:    s_waitcnt vmcnt(0)
+; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: test_sgpr_offset_function:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s32 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_add_i32 s0, s32, 0x1004
@@ -486,7 +482,7 @@ define void @test_sgpr_offset_function() {
 ; FLATSCR-NEXT:    scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
@@ -509,7 +505,6 @@ entry:
 define void @test_sgpr_offset_subregs_function() {
 ; MUBUF-LABEL: test_sgpr_offset_subregs_function:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
@@ -527,11 +522,11 @@ define void @test_sgpr_offset_subregs_function() {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; v[0:1]
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
@@ -544,6 +539,7 @@ define void @test_sgpr_offset_subregs_function() {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; v[0:1]
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; We want to test the spill of the last subreg of %a is the highest
@@ -573,7 +569,6 @@ entry:
 define void @test_inst_offset_subregs_function() {
 ; MUBUF-LABEL: test_inst_offset_subregs_function:
 ; MUBUF:       ; %bb.0: ; %entry
-; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
@@ -593,11 +588,11 @@ define void @test_inst_offset_subregs_function() {
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; v[0:1]
 ; MUBUF-NEXT:    ;;#ASMEND
+; MUBUF-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; FLATSCR-LABEL: test_inst_offset_subregs_function:
 ; FLATSCR:       ; %bb.0: ; %entry
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
@@ -610,6 +605,7 @@ define void @test_inst_offset_subregs_function() {
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; v[0:1]
 ; FLATSCR-NEXT:    ;;#ASMEND
+; FLATSCR-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index 81dd2c4457b2fbd..b8413e4aabeb2d6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -9,7 +9,7 @@
 define void @test() {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0: ; %bb.0
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt expcnt(0)
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
@@ -71,7 +71,7 @@ define void @test() {
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb.0:
   br label %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
index c1c69ce568a9c40..f22c6b9707fb70d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.writelane(i32, i32, i32)
 define void @sgpr_spill_writelane() {
 ; GCN-LABEL: sgpr_spill_writelane:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
@@ -20,7 +20,7 @@ define void @sgpr_spill_writelane() {
 ; GCN-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   call void asm sideeffect "", "~{s35}"()
   ret void
@@ -29,12 +29,11 @@ define void @sgpr_spill_writelane() {
 define void @device_writelane_intrinsic(ptr addrspace(1) %out, i32 %src) {
 ; GCN-LABEL: device_writelane_intrinsic:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v3, 15
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v2
 ; GCN-NEXT:    v_writelane_b32 v3, s4, 23
 ; GCN-NEXT:    global_store_dword v[0:1], v3, off
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %writelane = call i32 @llvm.amdgcn.writelane(i32 %src, i32 23, i32 15)
   store i32 %writelane, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59f1..a30f0c83700d1f2 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -4,7 +4,6 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x1f5a814b
 ; CHECK-NEXT:    s_mov_b32 s5, 0x52bf5b
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
@@ -12,6 +11,7 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fffffff, v0
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i29 %X, 99
   %cmp = icmp eq i29 %srem, 0
@@ -21,7 +21,6 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK-LABEL: test_srem_even:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_i32 v1, v0, 0, 4
 ; CHECK-NEXT:    v_mul_i32_i24_e32 v1, 3, v1
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
@@ -33,6 +32,7 @@ define i1 @test_srem_even(i4 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i4 %X, 6
   %cmp = icmp eq i4 %srem, 1
@@ -42,7 +42,6 @@ define i1 @test_srem_even(i4 %X) nounwind {
 define i1 @test_srem_pow2_setne(i6 %X) nounwind {
 ; CHECK-LABEL: test_srem_pow2_setne:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_i32 v1, v0, 0, 6
 ; CHECK-NEXT:    v_bfe_u32 v1, v1, 9, 2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v0, v1
@@ -51,6 +50,7 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 63, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem i6 %X, 4
   %cmp = icmp ne i6 %srem, 0
@@ -60,7 +60,6 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
 define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-LABEL: test_srem_vec:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_bfe_i32 v3, v2, 0, 31
 ; CHECK-NEXT:    v_bfe_i32 v4, v1, 0, 31
 ; CHECK-NEXT:    v_bfe_i32 v5, v0, 0, 31
@@ -94,6 +93,7 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %srem = srem <3 x i31> %X, <i31 9, i31 9, i31 -9>
   %cmp = icmp ne <3 x i31> %srem, <i31 3, i31 -3, i31 3>
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index bfb409438757c73..5908071afe54d07 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -216,7 +216,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_srem:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v4, vcc
@@ -334,11 +333,11 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
@@ -370,6 +369,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v9, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v8, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -440,6 +440,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %x, %y
   ret i64 %result
@@ -602,7 +603,6 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %
 define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_srem24_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 40
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v2
@@ -622,11 +622,11 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem24_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[2:3], v[2:3], 40
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
@@ -646,6 +646,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
@@ -1471,7 +1472,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
@@ -1569,11 +1569,11 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -1594,6 +1594,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB11_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1658,6 +1659,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 24, %x
   ret i64 %result
@@ -1666,7 +1668,6 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
@@ -1760,11 +1761,11 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -1786,6 +1787,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1851,6 +1853,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 32768, %x
   ret i64 %result
@@ -1859,7 +1862,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
@@ -1867,11 +1869,11 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
@@ -1892,6 +1894,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB13_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1953,6 +1956,7 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %x, 32768
   ret i64 %result
@@ -2099,7 +2103,6 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x
 define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem24_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -2117,11 +2120,11 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem24_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -2139,6 +2142,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 24, %x.shr
@@ -2148,7 +2152,6 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -2166,11 +2169,11 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem24_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -2188,6 +2191,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 32768, %x.shr
@@ -2197,7 +2201,6 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_srem24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
@@ -2205,11 +2208,11 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_srem24_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ashr_i64 v[0:1], v[0:1], 40
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
@@ -2226,6 +2229,7 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 %x.shr, 32768
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 71017f15e3c6d14..1d0ef60525ea59c 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -8,39 +8,39 @@
 define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_movk_i32 s4, 0xff80
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x7f
 ; GFX6-NEXT:    v_med3_i32 v0, v0, s4, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
 ; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 8, v0
 ; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
 ; GFX10PLUS-NEXT:    v_ashrrev_i16 v0, 8, v0
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -49,18 +49,17 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
 define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX6-NEXT:    v_med3_i32 v0, v0, s4, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
@@ -68,18 +67,19 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -88,7 +88,6 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
 define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
@@ -96,11 +95,11 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
@@ -108,18 +107,19 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -128,7 +128,6 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
@@ -143,11 +142,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v4, v3, v2
@@ -166,18 +165,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -186,7 +186,6 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
@@ -206,11 +205,11 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
@@ -236,20 +235,21 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v3i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
   ret <3 x i16> %result
@@ -258,7 +258,6 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
@@ -283,11 +282,11 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX8-NEXT:    v_sub_u16_e32 v6, v5, v4
@@ -324,20 +323,21 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -347,7 +347,6 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
@@ -362,11 +361,11 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v2
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
@@ -381,20 +380,21 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -403,7 +403,6 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
@@ -425,11 +424,11 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
@@ -451,22 +450,23 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -475,7 +475,6 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
@@ -504,11 +503,11 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
@@ -537,24 +536,25 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_sub_i32 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -563,7 +563,6 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v8i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v8
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
@@ -620,11 +619,11 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v8i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v8
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v0, v8
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
@@ -681,11 +680,11 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v8i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v8 clamp
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v9 clamp
 ; GFX9-NEXT:    v_sub_i32 v2, v2, v10 clamp
@@ -694,11 +693,11 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_i32 v5, v5, v13 clamp
 ; GFX9-NEXT:    v_sub_i32 v6, v6, v14 clamp
 ; GFX9-NEXT:    v_sub_i32 v7, v7, v15 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_ssubsat_v8i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v0, v0, v8 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v1, v1, v9 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v2, v2, v10 clamp
@@ -707,6 +706,7 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v5, v5, v13 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v6, v6, v14 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_i32 v7, v7, v15 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
   ret <8 x i32> %result
@@ -715,7 +715,6 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v0, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
@@ -830,11 +829,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
 ; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v0, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
@@ -949,11 +948,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
 ; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_sub_i32 v1, v1, v17 clamp
@@ -972,11 +971,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_i32 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_i32 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
@@ -995,11 +994,11 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
@@ -1018,6 +1017,7 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -1027,7 +1027,6 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_ssubsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -1037,11 +1036,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -1051,11 +1050,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
@@ -1065,11 +1064,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
@@ -1079,11 +1078,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_i64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
@@ -1092,6 +1091,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index 78871385f8ffcb2..68fda569e1d07c6 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -20,50 +20,50 @@ define hidden void @stack_passed_argument([32 x i32], i32) {
 define void @func_store_stacksave() {
 ; WAVE32-OPT-LABEL: func_store_stacksave:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_lshr_b32 s4, s32, 5
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
 ; WAVE32-OPT-NEXT:    ; use s4
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_store_stacksave:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_lshr_b32 s4, s32, 6
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
 ; WAVE64-OPT-NEXT:    ; use s4
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_store_stacksave:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-O0-NEXT:    s_lshr_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    ;;#ASMSTART
 ; WAVE32-O0-NEXT:    ; use s4
 ; WAVE32-O0-NEXT:    ;;#ASMEND
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_store_stacksave:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE64-O0-NEXT:    s_lshr_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    ;;#ASMSTART
 ; WAVE64-O0-NEXT:    ; use s4
 ; WAVE64-O0-NEXT:    ;;#ASMEND
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_store_stacksave:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshr_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMSTART
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s4
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   %stacksave = call ptr addrspace(5) @llvm.stacksave.p5()
   call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %stacksave)
@@ -202,9 +202,9 @@ define amdgpu_kernel void @kernel_store_stacksave_nocall() {
 define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-OPT-LABEL: func_stacksave_nonentry_block:
 ; WAVE32-OPT:       ; %bb.0: ; %bb0
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    v_and_b32_e32 v0, 1, v0
 ; WAVE32-OPT-NEXT:    s_mov_b32 s4, exec_lo
+; WAVE32-OPT-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-OPT-NEXT:    v_cmpx_eq_u32_e32 1, v0
 ; WAVE32-OPT-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE32-OPT-NEXT:  ; %bb.1: ; %bb1
@@ -214,13 +214,14 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
 ; WAVE32-OPT-NEXT:  .LBB4_2: ; %bb2
 ; WAVE32-OPT-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_nonentry_block:
 ; WAVE64-OPT:       ; %bb.0: ; %bb0
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    v_and_b32_e32 v0, 1, v0
 ; WAVE64-OPT-NEXT:    s_mov_b64 s[4:5], exec
+; WAVE64-OPT-NEXT:    s_waitcnt expcnt(0)
 ; WAVE64-OPT-NEXT:    v_cmpx_eq_u32_e32 1, v0
 ; WAVE64-OPT-NEXT:    s_cbranch_execz .LBB4_2
 ; WAVE64-OPT-NEXT:  ; %bb.1: ; %bb1
@@ -230,11 +231,12 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
 ; WAVE64-OPT-NEXT:  .LBB4_2: ; %bb2
 ; WAVE64-OPT-NEXT:    s_or_b64 exec, exec, s[4:5]
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_nonentry_block:
 ; WAVE32-O0:       ; %bb.0: ; %bb0
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-O0-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s4
@@ -271,12 +273,12 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; WAVE32-O0-NEXT:    s_mov_b32 exec_lo, s4
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_nonentry_block:
 ; WAVE64-O0:       ; %bb.0: ; %bb0
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-O0-NEXT:    s_waitcnt expcnt(0)
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; WAVE64-O0-NEXT:    s_mov_b64 exec, s[4:5]
@@ -315,12 +317,12 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; WAVE64-O0-NEXT:    s_mov_b64 exec, s[4:5]
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_nonentry_block:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0: ; %bb0
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
@@ -345,7 +347,7 @@ define void @func_stacksave_nonentry_block(i1 %cond) {
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 exec_lo, s4
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
 bb0:
   br i1 %cond, label %bb1, label %bb2
@@ -362,38 +364,38 @@ bb2:
 define void @func_stackrestore_poison() {
 ; WAVE32-OPT-LABEL: func_stackrestore_poison:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_lshl_b32 s32, s4, 5
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stackrestore_poison:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_lshl_b32 s32, s4, 6
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stackrestore_poison:
 ; WAVE32-O0:       ; %bb.0:
 ; WAVE32-O0-NEXT:    ; implicit-def: $sgpr4
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stackrestore_poison:
 ; WAVE64-O0:       ; %bb.0:
 ; WAVE64-O0-NEXT:    ; implicit-def: $sgpr4
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stackrestore_poison:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
 ; WAVE32-WWM-PREALLOC-NEXT:    ; implicit-def: $sgpr4
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) poison)
   ret void
@@ -402,38 +404,38 @@ define void @func_stackrestore_poison() {
 define void @func_stackrestore_null() {
 ; WAVE32-OPT-LABEL: func_stackrestore_null:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_mov_b32 s32, 0
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stackrestore_null:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_mov_b32 s32, 0
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stackrestore_null:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, 0
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stackrestore_null:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, 0
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stackrestore_null:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, 0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) null)
   ret void
@@ -442,38 +444,38 @@ define void @func_stackrestore_null() {
 define void @func_stackrestore_neg1() {
 ; WAVE32-OPT-LABEL: func_stackrestore_neg1:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_movk_i32 s32, 0xffe0
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stackrestore_neg1:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_movk_i32 s32, 0xffc0
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stackrestore_neg1:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, -1
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stackrestore_neg1:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, -1
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stackrestore_neg1:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) inttoptr (i32 -1 to ptr addrspace(5)))
   ret void
@@ -482,38 +484,38 @@ define void @func_stackrestore_neg1() {
 define void @func_stackrestore_42() {
 ; WAVE32-OPT-LABEL: func_stackrestore_42:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_movk_i32 s32, 0x540
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stackrestore_42:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_movk_i32 s32, 0xa80
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stackrestore_42:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, 42
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stackrestore_42:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, 42
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stackrestore_42:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, 42
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) inttoptr (i32 42 to ptr addrspace(5)))
   ret void
@@ -532,23 +534,23 @@ define void @func_stacksave_stackrestore() {
 ;
 ; WAVE32-O0-LABEL: func_stacksave_stackrestore:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_stackrestore:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   %stacksave = call ptr addrspace(5) @llvm.stacksave.p5()
   call void @llvm.stackrestore.p5(ptr addrspace(5) %stacksave)
@@ -558,53 +560,53 @@ define void @func_stacksave_stackrestore() {
 define void @func_stacksave_stackrestore_use() {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_use:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_lshr_b32 s4, s32, 5
 ; WAVE32-OPT-NEXT:    ;;#ASMSTART
 ; WAVE32-OPT-NEXT:    ; use s4
 ; WAVE32-OPT-NEXT:    ;;#ASMEND
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_use:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_lshr_b32 s4, s32, 6
 ; WAVE64-OPT-NEXT:    ;;#ASMSTART
 ; WAVE64-OPT-NEXT:    ; use s4
 ; WAVE64-OPT-NEXT:    ;;#ASMEND
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_stackrestore_use:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-O0-NEXT:    s_lshr_b32 s5, s4, 5
 ; WAVE32-O0-NEXT:    ;;#ASMSTART
 ; WAVE32-O0-NEXT:    ; use s5
 ; WAVE32-O0-NEXT:    ;;#ASMEND
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_stackrestore_use:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE64-O0-NEXT:    s_lshr_b32 s5, s4, 6
 ; WAVE64-O0-NEXT:    ;;#ASMSTART
 ; WAVE64-O0-NEXT:    ; use s5
 ; WAVE64-O0-NEXT:    ;;#ASMEND
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_use:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshr_b32 s5, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMSTART
 ; WAVE32-WWM-PREALLOC-NEXT:    ; use s5
 ; WAVE32-WWM-PREALLOC-NEXT:    ;;#ASMEND
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   %stacksave = call ptr addrspace(5) @llvm.stacksave.p5()
   call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %stacksave)
@@ -667,53 +669,53 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_use() {
 define void @func_stacksave_stackrestore_voffset(i32 %offset) {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_lshr_b32 s4, s32, 5
 ; WAVE32-OPT-NEXT:    v_add_nc_u32_e32 v0, s4, v0
 ; WAVE32-OPT-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-OPT-NEXT:    s_lshl_b32 s32, s4, 5
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_lshr_b32 s4, s32, 6
 ; WAVE64-OPT-NEXT:    v_add_nc_u32_e32 v0, s4, v0
 ; WAVE64-OPT-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE64-OPT-NEXT:    s_lshl_b32 s32, s4, 6
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-O0-NEXT:    s_lshr_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    v_add_nc_u32_e64 v0, s4, v0
 ; WAVE32-O0-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s4, s32
 ; WAVE64-O0-NEXT:    s_lshr_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    v_add_nc_u32_e64 v0, s4, v0
 ; WAVE64-O0-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_voffset:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s4, s32
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshr_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    v_add_nc_u32_e64 v0, s4, v0
 ; WAVE32-WWM-PREALLOC-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   %stacksave = call ptr addrspace(5) @llvm.stacksave.p5()
   %gep = getelementptr i8, ptr addrspace(5) %stacksave, i32 %offset
@@ -724,40 +726,40 @@ define void @func_stacksave_stackrestore_voffset(i32 %offset) {
 define void @func_stacksave_vgpr(ptr addrspace(5) %stack) {
 ; WAVE32-OPT-LABEL: func_stacksave_vgpr:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-OPT-NEXT:    s_lshl_b32 s32, s4, 5
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_vgpr:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE64-OPT-NEXT:    s_lshl_b32 s32, s4, 6
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_vgpr:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-O0-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_vgpr:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE64-O0-NEXT:    s_lshl_b32 s4, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s4
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_vgpr:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    v_readfirstlane_b32 s4, v0
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s4, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s4
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) %stack)
   ret void
@@ -766,35 +768,35 @@ define void @func_stacksave_vgpr(ptr addrspace(5) %stack) {
 define amdgpu_gfx void @func_stacksave_sgpr(ptr addrspace(5) inreg %stack) {
 ; WAVE32-OPT-LABEL: func_stacksave_sgpr:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_lshl_b32 s32, s4, 5
+; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_sgpr:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_lshl_b32 s32, s4, 6
+; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-O0-LABEL: func_stacksave_sgpr:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_lshl_b32 s34, s4, 5
 ; WAVE32-O0-NEXT:    s_mov_b32 s32, s34
+; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE64-O0-LABEL: func_stacksave_sgpr:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_lshl_b32 s34, s4, 6
 ; WAVE64-O0-NEXT:    s_mov_b32 s32, s34
+; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_sgpr:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_lshl_b32 s34, s4, 5
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s32, s34
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_setpc_b64 s[30:31]
   call void @llvm.stackrestore.p5(ptr addrspace(5) %stack)
   ret void
@@ -1273,9 +1275,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
 define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ; WAVE32-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE32-OPT:       ; %bb.0:
-; WAVE32-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-OPT-NEXT:    s_mov_b32 s8, s33
 ; WAVE32-OPT-NEXT:    s_mov_b32 s33, s32
+; WAVE32-OPT-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-OPT-NEXT:    s_xor_saveexec_b32 s4, -1
 ; WAVE32-OPT-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; WAVE32-OPT-NEXT:    s_mov_b32 exec_lo, s4
@@ -1308,9 +1310,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ;
 ; WAVE64-OPT-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE64-OPT:       ; %bb.0:
-; WAVE64-OPT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-OPT-NEXT:    s_mov_b32 s8, s33
 ; WAVE64-OPT-NEXT:    s_mov_b32 s33, s32
+; WAVE64-OPT-NEXT:    s_waitcnt expcnt(0)
 ; WAVE64-OPT-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; WAVE64-OPT-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; WAVE64-OPT-NEXT:    s_mov_b64 exec, s[4:5]
@@ -1343,9 +1345,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ;
 ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE32-O0:       ; %bb.0:
-; WAVE32-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-O0-NEXT:    s_mov_b32 s25, s33
 ; WAVE32-O0-NEXT:    s_mov_b32 s33, s32
+; WAVE32-O0-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-O0-NEXT:    s_xor_saveexec_b32 s16, -1
 ; WAVE32-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; WAVE32-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
@@ -1462,9 +1464,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ;
 ; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE64-O0:       ; %bb.0:
-; WAVE64-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE64-O0-NEXT:    s_mov_b32 s19, s33
 ; WAVE64-O0-NEXT:    s_mov_b32 s33, s32
+; WAVE64-O0-NEXT:    s_waitcnt expcnt(0)
 ; WAVE64-O0-NEXT:    s_xor_saveexec_b64 s[16:17], -1
 ; WAVE64-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; WAVE64-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
@@ -1581,9 +1583,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
 ;
 ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
 ; WAVE32-WWM-PREALLOC:       ; %bb.0:
-; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s24, s33
 ; WAVE32-WWM-PREALLOC-NEXT:    s_mov_b32 s33, s32
+; WAVE32-WWM-PREALLOC-NEXT:    s_waitcnt expcnt(0)
 ; WAVE32-WWM-PREALLOC-NEXT:    s_xor_saveexec_b32 s16, -1
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
 ; WAVE32-WWM-PREALLOC-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f88aaf389ca9aeb..913bee7044b9f34 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -8,40 +8,36 @@
 define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
 ; CIVI:       ; %bb.0:
-; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_mov_b32 m0, -1
 ; CIVI-NEXT:    ds_write_b16 v0, v2 offset:4
 ; CIVI-NEXT:    ds_write_b32 v0, v1
 ; CIVI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; CIVI-NEXT:    ds_write_b8 v0, v1 offset:6
-; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i56:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
 ; GFX9-NEXT:    ds_write_b32 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: local_store_i56:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
 ; GFX10-NEXT:    ds_write_b16 v0, v2 offset:4
 ; GFX10-NEXT:    ds_write_b32 v0, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: local_store_i56:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:6
 ; GFX11-NEXT:    ds_store_b16 v0, v2 offset:4
 ; GFX11-NEXT:    ds_store_b32 v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i56 %arg, ptr addrspace(3) %ptr, align 8
   ret void
@@ -305,35 +301,31 @@ define amdgpu_kernel void @local_store_i65(ptr addrspace(3) %ptr, i65 %arg) #0 {
 define void @local_store_i13(ptr addrspace(3) %ptr, i13 %arg) #0 {
 ; CIVI-LABEL: local_store_i13:
 ; CIVI:       ; %bb.0:
-; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
 ; CIVI-NEXT:    s_mov_b32 m0, -1
 ; CIVI-NEXT:    ds_write_b16 v0, v1
-; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i13:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
 ; GFX9-NEXT:    ds_write_b16 v0, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: local_store_i13:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
 ; GFX10-NEXT:    ds_write_b16 v0, v1
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: local_store_i13:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
 ; GFX11-NEXT:    ds_store_b16 v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i13 %arg, ptr addrspace(3) %ptr, align 8
   ret void
@@ -342,39 +334,35 @@ define void @local_store_i13(ptr addrspace(3) %ptr, i13 %arg) #0 {
 define void @local_store_i17(ptr addrspace(3) %ptr, i17 %arg) #0 {
 ; CIVI-LABEL: local_store_i17:
 ; CIVI:       ; %bb.0:
-; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_mov_b32 m0, -1
 ; CIVI-NEXT:    ds_write_b16 v0, v1
 ; CIVI-NEXT:    v_bfe_u32 v1, v1, 16, 1
 ; CIVI-NEXT:    ds_write_b8 v0, v1 offset:2
-; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
+; CIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: local_store_i17:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    ds_write_b16 v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x1ffff, v1
 ; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: local_store_i17:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0x1ffff, v1
 ; GFX10-NEXT:    ds_write_b16 v0, v1
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: local_store_i17:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x1ffff, v1
 ; GFX11-NEXT:    ds_store_b16 v0, v1
 ; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:2
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i17 %arg, ptr addrspace(3) %ptr, align 8
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
index f3be926f76befd5..fd7666ec646e795 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
@@ -8,14 +8,14 @@
 define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -24,14 +24,14 @@ define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 {
 define half @v_constained_fadd_f16_fpexcept_ignore(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f16_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret half %val
@@ -40,14 +40,14 @@ define half @v_constained_fadd_f16_fpexcept_ignore(half %x, half %y) #0 {
 define half @v_constained_fadd_f16_fpexcept_maytrap(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret half %val
@@ -56,22 +56,22 @@ define half @v_constained_fadd_f16_fpexcept_maytrap(half %x, half %y) #0 {
 define <2 x half> @v_constained_fadd_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -80,22 +80,22 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_strict(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fadd_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x half> %val
@@ -104,22 +104,22 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fadd_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x half> %val
@@ -128,25 +128,25 @@ define <2 x half> @v_constained_fadd_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x
 define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_v3f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX10PLUS-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -156,7 +156,6 @@ define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_f16_e32 v1, v1, v3
@@ -164,33 +163,33 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v4, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f16_e32 v1, v1, v3
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
@@ -201,6 +200,7 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX11-NEXT:    v_add_f16_e32 v3, v7, v4
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll
index 2aecf5fd8753c95..ca64bc5b9cad928 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll
@@ -6,14 +6,14 @@
 define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -22,14 +22,14 @@ define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
 define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
@@ -38,14 +38,14 @@ define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 {
 define float @v_constained_fadd_f32_fpexcept_maytrap(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret float %val
@@ -54,22 +54,22 @@ define float @v_constained_fadd_f32_fpexcept_maytrap(float %x, float %y) #0 {
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fadd_v2f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -78,22 +78,22 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fadd_v2f32_fpexcept_ignore:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x float> %val
@@ -102,22 +102,22 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f32_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fadd_v2f32_fpexcept_maytrap:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x float> %val
@@ -126,25 +126,25 @@ define <2 x float> @v_constained_fadd_v2f32_fpexcept_maytrap(<2 x float> %x, <2
 define <3 x float> @v_constained_fadd_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GCN-NEXT:    v_add_f32_e32 v1, v1, v4
 ; GCN-NEXT:    v_add_f32_e32 v2, v2, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v3f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fadd_v3f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_add_f32 v0, v0, v3 :: v_dual_add_f32 v1, v1, v4
 ; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -168,14 +168,14 @@ define amdgpu_ps float @s_constained_fadd_f32_fpexcept_strict(float inreg %x, fl
 define float @v_constained_fadd_f32_fpexcept_strict_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %val = call float @llvm.experimental.constrained.fadd.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -185,14 +185,14 @@ define float @v_constained_fadd_f32_fpexcept_strict_fabs_lhs(float %x, float %y)
 define float @v_constained_fadd_f32_fpexcept_strict_fabs_rhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_rhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e64 v0, v0, |v1|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_rhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_add_f32_e64 v0, v0, |v1|
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y) #0
   %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -202,14 +202,14 @@ define float @v_constained_fadd_f32_fpexcept_strict_fabs_rhs(float %x, float %y)
 define float @v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e64 v0, v1, |v0|
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %neg.fabs.x = fneg float %fabs.x
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
index faa0131c88c2d20..9ccbc416877248f 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll
@@ -6,14 +6,14 @@
 define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %val
@@ -22,14 +22,14 @@ define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 {
 define double @v_constained_fadd_f64_fpexcept_ignore(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret double %val
@@ -38,14 +38,14 @@ define double @v_constained_fadd_f64_fpexcept_ignore(double %x, double %y) #0 {
 define double @v_constained_fadd_f64_fpexcept_maytrap(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fadd_f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret double %val
@@ -54,16 +54,16 @@ define double @v_constained_fadd_f64_fpexcept_maytrap(double %x, double %y) #0 {
 define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -72,16 +72,16 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_strict(<2 x double> %x, <2
 define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -90,16 +90,16 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_ignore(<2 x double> %x, <2
 define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v2f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v2f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
@@ -108,18 +108,18 @@ define <2 x double> @v_constained_fadd_v2f64_fpexcept_maytrap(<2 x double> %x, <
 define <3 x double> @v_constained_fadd_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fadd_v3f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], v[6:7]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], v[8:9]
 ; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fadd_v3f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[6:7]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], v[8:9]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index 407bb002483ec89..b6ff9f6b997b11b 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -7,20 +7,20 @@
 define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -29,13 +29,12 @@ define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0
 define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -43,18 +42,19 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -63,14 +63,13 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
 define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
@@ -79,20 +78,21 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v6
 ; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX10-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_v3f16_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX11-NEXT:    v_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -101,7 +101,6 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
 define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -115,11 +114,11 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -134,11 +133,11 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -151,11 +150,11 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
 ; GFX10-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
 ; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
@@ -168,6 +167,7 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX11-NEXT:    v_fmac_f16_e32 v9, v11, v10
 ; GFX11-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
@@ -176,20 +176,20 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg half %z
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -199,20 +199,20 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z
 define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg half %x
   %neg.y = fneg half %y
@@ -223,20 +223,20 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha
 define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, half %z) #0 {
 ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call half @llvm.fabs.f16(half %x) #0
   %neg.y = call half @llvm.fabs.f16(half %y) #0
@@ -247,13 +247,12 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha
 define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
 ; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
@@ -261,18 +260,19 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x half> %x
   %neg.y = fneg <2 x half> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
index 922a4dcbea351e7..72a8537f2e24d3d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
@@ -6,14 +6,14 @@
 define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -22,16 +22,16 @@ define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z)
 define <2 x float> @v_constained_fma_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v2, v4
 ; GFX10-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -40,18 +40,18 @@ define <2 x float> @v_constained_fma_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <3 x float> @v_constained_fma_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y, <3 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v3f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v3, v6
 ; GFX10-NEXT:    v_fma_f32 v1, v1, v4, v7
 ; GFX10-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float> %x, <3 x float> %y, <3 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -60,20 +60,20 @@ define <3 x float> @v_constained_fma_v3f32_fpexcept_strict(<3 x float> %x, <3 x
 define <4 x float> @v_constained_fma_v4f32_fpexcept_strict(<4 x float> %x, <4 x float> %y, <4 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v4f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v4f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v4, v8
 ; GFX10-NEXT:    v_fma_f32 v1, v1, v5, v9
 ; GFX10-NEXT:    v_fma_f32 v2, v2, v6, v10
 ; GFX10-NEXT:    v_fma_f32 v3, v3, v7, v11
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x float> %val
@@ -82,14 +82,14 @@ define <4 x float> @v_constained_fma_v4f32_fpexcept_strict(<4 x float> %x, <4 x
 define float @v_constained_fma_f32_fpexcept_strict_fneg(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg float %z
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -99,14 +99,14 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg(float %x, float %y, floa
 define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %neg.y = fneg float %y
@@ -117,14 +117,14 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y,
 define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y, float %z) #0 {
 ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call float @llvm.fabs.f32(float %x) #0
   %neg.y = call float @llvm.fabs.f32(float %y) #0
@@ -135,16 +135,16 @@ define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y,
 define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f32 v0, -v0, -v2, v4
 ; GCN-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f32 v0, -v0, -v2, v4
 ; GFX10-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x float> %x
   %neg.y = fneg <2 x float> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
index 5cb293ab25806a7..8b4c01962b9b026 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
@@ -6,14 +6,14 @@
 define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %val
@@ -22,16 +22,16 @@ define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double
 define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -40,18 +40,18 @@ define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2
 define <3 x double> @v_constained_fma_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y, <3 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v3f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v3f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double> %x, <3 x double> %y, <3 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x double> %val
@@ -60,20 +60,20 @@ define <3 x double> @v_constained_fma_v3f64_fpexcept_strict(<3 x double> %x, <3
 define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4 x double> %y, <4 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v4f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GCN-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v4f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
 ; GFX10-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
 ; GFX10-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x double> %val
@@ -82,14 +82,14 @@ define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4
 define double @v_constained_fma_f64_fpexcept_strict_fneg(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg double %z
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -99,14 +99,14 @@ define double @v_constained_fma_f64_fpexcept_strict_fneg(double %x, double %y, d
 define double @v_constained_fma_f64_fpexcept_strict_fneg_fneg(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %neg.y = fneg double %y
@@ -117,14 +117,14 @@ define double @v_constained_fma_f64_fpexcept_strict_fneg_fneg(double %x, double
 define double @v_constained_fma_f64_fpexcept_strict_fabs_fabs(double %x, double %y, double %z) #0 {
 ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call double @llvm.fabs.f64(double %x) #0
   %neg.y = call double @llvm.fabs.f64(double %y) #0
@@ -135,16 +135,16 @@ define double @v_constained_fma_f64_fpexcept_strict_fabs_fabs(double %x, double
 define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
 ; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
 ; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
 ; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
index d798166a6783966..b48a80c90d9d110 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
@@ -17,14 +17,14 @@
 define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -33,14 +33,14 @@ define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 {
 define half @v_constained_fmul_f16_fpexcept_ignore(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f16_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret half %val
@@ -49,14 +49,14 @@ define half @v_constained_fmul_f16_fpexcept_ignore(half %x, half %y) #0 {
 define half @v_constained_fmul_f16_fpexcept_maytrap(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret half %val
@@ -65,30 +65,30 @@ define half @v_constained_fmul_f16_fpexcept_maytrap(half %x, half %y) #0 {
 define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -97,30 +97,30 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x half> %val
@@ -129,30 +129,30 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x half> %val
@@ -161,62 +161,62 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x
 define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
 ; GFX1-GISEL:       ; %bb.0:
-; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX1-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -226,7 +226,6 @@ define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
@@ -234,58 +233,58 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX8-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_mul_f16_e32 v2, v1, v3
 ; GFX8-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
@@ -296,13 +295,14 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX11-SDAG-NEXT:    v_mul_f16_e32 v3, v7, v4
 ; GFX11-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SDAG-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX1-GISEL:       ; %bb.0:
-; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX1-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX1-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll
index 8df2834928395c7..b152b53fcdef7ba 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll
@@ -9,14 +9,14 @@
 define float @v_constained_fmul_f32_fpexcept_strict(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -25,14 +25,14 @@ define float @v_constained_fmul_f32_fpexcept_strict(float %x, float %y) #0 {
 define float @v_constained_fmul_f32_fpexcept_ignore(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
@@ -41,14 +41,14 @@ define float @v_constained_fmul_f32_fpexcept_ignore(float %x, float %y) #0 {
 define float @v_constained_fmul_f32_fpexcept_maytrap(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret float %val
@@ -57,22 +57,22 @@ define float @v_constained_fmul_f32_fpexcept_maytrap(float %x, float %y) #0 {
 define <2 x float> @v_constained_fmul_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -81,22 +81,22 @@ define <2 x float> @v_constained_fmul_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fmul_v2f32_fpexcept_ignore(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_ignore:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x float> %val
@@ -105,22 +105,22 @@ define <2 x float> @v_constained_fmul_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fmul_v2f32_fpexcept_maytrap(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f32_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_maytrap:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x float> %val
@@ -129,25 +129,25 @@ define <2 x float> @v_constained_fmul_v2f32_fpexcept_maytrap(<2 x float> %x, <2
 define <3 x float> @v_constained_fmul_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GCN-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GCN-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v3f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fmul_v3f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v3 :: v_dual_mul_f32 v1, v1, v4
 ; GFX11-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -171,14 +171,14 @@ define amdgpu_ps float @s_constained_fmul_f32_fpexcept_strict(float inreg %x, fl
 define float @v_constained_fmul_f32_fpexcept_strict_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e64 v0, |v0|, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %val = call float @llvm.experimental.constrained.fmul.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -188,14 +188,14 @@ define float @v_constained_fmul_f32_fpexcept_strict_fabs_lhs(float %x, float %y)
 define float @v_constained_fmul_f32_fpexcept_strict_fabs_rhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_rhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, v0, |v1|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_rhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e64 v0, v0, |v1|
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y) #0
   %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -205,14 +205,14 @@ define float @v_constained_fmul_f32_fpexcept_strict_fabs_rhs(float %x, float %y)
 define float @v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %neg.fabs.x = fneg float %fabs.x
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
index 8c98a662c59cc3d..8215086c6317f76 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll
@@ -9,14 +9,14 @@
 define double @v_constained_fmul_f64_fpexcept_strict(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %val
@@ -25,14 +25,14 @@ define double @v_constained_fmul_f64_fpexcept_strict(double %x, double %y) #0 {
 define double @v_constained_fmul_f64_fpexcept_ignore(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret double %val
@@ -41,14 +41,14 @@ define double @v_constained_fmul_f64_fpexcept_ignore(double %x, double %y) #0 {
 define double @v_constained_fmul_f64_fpexcept_maytrap(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fmul_f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret double %val
@@ -57,16 +57,16 @@ define double @v_constained_fmul_f64_fpexcept_maytrap(double %x, double %y) #0 {
 define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -75,16 +75,16 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_strict(<2 x double> %x, <2
 define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -93,16 +93,16 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_ignore(<2 x double> %x, <2
 define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v2f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v2f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
@@ -111,18 +111,18 @@ define <2 x double> @v_constained_fmul_v2f64_fpexcept_maytrap(<2 x double> %x, <
 define <3 x double> @v_constained_fmul_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fmul_v3f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GCN-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GCN-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fmul_v3f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], v[6:7]
 ; GFX10-NEXT:    v_mul_f64 v[2:3], v[2:3], v[8:9]
 ; GFX10-NEXT:    v_mul_f64 v[4:5], v[4:5], v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 22bebb7ad26f532..82686a6f346fc16 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -9,8 +9,8 @@
 define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   ret float %result
@@ -19,26 +19,26 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
 define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half> %arg) #0 {
 ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX89-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX89-NEXT:    v_mov_b32_e32 v0, v2
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
   ret <2 x float>   %result
@@ -47,32 +47,32 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half
 define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half> %arg) #0 {
 ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; GFX89-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX89-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX89-NEXT:    v_mov_b32_e32 v1, v3
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
   ret <3 x float>   %result
@@ -81,8 +81,8 @@ define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half
 define double @v_constrained_fpext_f32_to_f64_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_f32_to_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.fpext.f64.f32(float %arg, metadata !"fpexcept.strict")
   ret double %result
@@ -91,10 +91,10 @@ define double @v_constrained_fpext_f32_to_f64_fpexcept_strict(float %arg) #0 {
 define <2 x double> @v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict(<2 x float> %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, v1
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x float> %arg, metadata !"fpexcept.strict")
   ret <2 x double>   %result
@@ -103,22 +103,22 @@ define <2 x double> @v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict(<2 x flo
 define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x float> %arg) #0 {
 ; GFX89-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX89-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX1011-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1011-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX1011-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX1011-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(<3 x float> %arg, metadata !"fpexcept.strict")
   ret <3 x double>   %result
@@ -127,9 +127,9 @@ define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x flo
 define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.fpext.f64.f16(half %arg, metadata !"fpexcept.strict")
   ret double %result
@@ -138,30 +138,30 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 {
 define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x half> %arg) #0 {
 ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX89-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
   ret <2 x double>   %result
@@ -170,29 +170,28 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal
 define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x half> %arg) #0 {
 ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
 ; GFX89:       ; %bb.0:
-; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX89-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
@@ -200,6 +199,7 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
 ; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
   ret <3 x double>   %result
@@ -208,9 +208,9 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal
 define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
 ; GCN-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   %neg.result = fneg float %result
@@ -220,8 +220,8 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0
 define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg half %arg
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %neg.arg, metadata !"fpexcept.strict")
@@ -231,8 +231,8 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0
 define double @v_constrained_fpext_fneg_f32_to_f64_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fpext_fneg_f32_to_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg float %arg
   %result = call double @llvm.experimental.constrained.fpext.f64.f32(float %arg, metadata !"fpexcept.strict")
@@ -242,9 +242,9 @@ define double @v_constrained_fpext_fneg_f32_to_f64_fpexcept_strict(float %arg) #
 define double @v_constrained_fneg_fpext_f32_to_f64_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fneg_fpext_f32_to_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.fpext.f64.f32(float %arg, metadata !"fpexcept.strict")
   %neg.result = fneg double %result
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
index ec2bc43ca367873..fcb243efc0cd8e6 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
@@ -8,8 +8,8 @@
 define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -18,27 +18,27 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
 define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg) #0 {
 ; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1011-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX1011-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -47,30 +47,30 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo
 define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x float> %arg) #0 {
 ; GFX8-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict:
 ; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1011-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX1011-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX1011-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1011-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -79,8 +79,8 @@ define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x flo
 define float @v_constrained_fptrunc_f64_to_f32_fpexcept_strict(double %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_f64_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -89,9 +89,9 @@ define float @v_constrained_fptrunc_f64_to_f32_fpexcept_strict(double %arg) #0 {
 define <2 x float> @v_constrained_fptrunc_v2f64_to_v2f32_fpexcept_strict(<2 x double> %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_v2f64_to_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v1, v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fptrunc.v2f32.v2f64(<2 x double> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -100,10 +100,10 @@ define <2 x float> @v_constrained_fptrunc_v2f64_to_v2f32_fpexcept_strict(<2 x do
 define <3 x float> @v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict(<3 x double> %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v1, v[2:3]
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v2, v[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(<3 x double> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -128,9 +128,9 @@ define <3 x float> @v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict(<3 x do
 define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   %neg.val = fneg half %val
@@ -140,8 +140,8 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #
 define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg float %arg
   %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -151,9 +151,9 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #
 define float @v_constrained_fneg_fptrunc_f64_to_f32_fpexcept_strict(double %arg) #0 {
 ; GCN-LABEL: v_constrained_fneg_fptrunc_f64_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
   %neg.val = fneg float %val
@@ -163,8 +163,8 @@ define float @v_constrained_fneg_fptrunc_f64_to_f32_fpexcept_strict(double %arg)
 define float @v_constrained_fptrunc_fneg_f64_to_f32_fpexcept_strict(double %arg) #0 {
 ; GCN-LABEL: v_constrained_fptrunc_fneg_f64_to_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_f64_e64 v0, -v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg double %arg
   %val = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
index 3420596da2aacb7..d4e490d41515044 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll
@@ -16,14 +16,14 @@
 define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f16_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret half %val
@@ -32,14 +32,14 @@ define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 {
 define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f16_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret half %val
@@ -48,14 +48,14 @@ define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 {
 define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret half %val
@@ -64,64 +64,64 @@ define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 {
 define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10PLUS-SDAG:       ; %bb.0:
-; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x half> %val
@@ -130,64 +130,64 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX10PLUS-SDAG:       ; %bb.0:
-; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x half> %val
@@ -196,64 +196,64 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h
 define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_e32 v2, v0, v1
 ; GFX8-GISEL-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX10PLUS-SDAG:       ; %bb.0:
-; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v1
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v3, v2
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x half> %val
@@ -262,74 +262,73 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x
 define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX10-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX10PLUS-SDAG:       ; %bb.0:
-; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v2, v5, v4
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v0, v0, v2
@@ -337,6 +336,7 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 ; GFX10PLUS-GISEL-NEXT:    v_sub_f16_e32 v2, v4, v5
 ; GFX10PLUS-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x half> %val
@@ -346,7 +346,6 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
 ; GFX9-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
@@ -354,55 +353,55 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
 ; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_sub_f16_e32 v2, v1, v3
 ; GFX9-GISEL-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX8-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_sub_f16_e32 v2, v1, v3
 ; GFX8-GISEL-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_sub_f16_e32 v1, v1, v3
 ; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
 ; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_sub_f16_e32 v4, v0, v2
 ; GFX10-GISEL-NEXT:    v_sub_f16_e32 v5, v1, v3
 ; GFX10-GISEL-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -411,11 +410,11 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX10PLUS-SDAG:       ; %bb.0:
-; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX10PLUS-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
@@ -426,11 +425,11 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10PLUS-SDAG-NEXT:    v_sub_f16_e32 v3, v7, v4
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX10PLUS-SDAG-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10PLUS-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
 ; GFX10PLUS-GISEL:       ; %bb.0:
-; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX10PLUS-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
@@ -443,6 +442,7 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX10PLUS-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10PLUS-GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX10PLUS-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll
index 23dbe21379f7ff4..bacf874a6c33f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll
@@ -11,14 +11,14 @@
 define float @v_constained_fsub_f32_fpexcept_strict(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %val
@@ -27,14 +27,14 @@ define float @v_constained_fsub_f32_fpexcept_strict(float %x, float %y) #0 {
 define float @v_constained_fsub_f32_fpexcept_ignore(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_ignore:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret float %val
@@ -43,14 +43,14 @@ define float @v_constained_fsub_f32_fpexcept_ignore(float %x, float %y) #0 {
 define float @v_constained_fsub_f32_fpexcept_maytrap(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_maytrap:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret float %val
@@ -59,22 +59,22 @@ define float @v_constained_fsub_f32_fpexcept_maytrap(float %x, float %y) #0 {
 define <2 x float> @v_constained_fsub_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x float> %val
@@ -83,22 +83,22 @@ define <2 x float> @v_constained_fsub_v2f32_fpexcept_strict(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fsub_v2f32_fpexcept_ignore(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f32_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_ignore:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x float> %val
@@ -107,22 +107,22 @@ define <2 x float> @v_constained_fsub_v2f32_fpexcept_ignore(<2 x float> %x, <2 x
 define <2 x float> @v_constained_fsub_v2f32_fpexcept_maytrap(<2 x float> %x, <2 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f32_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f32_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_maytrap:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x float> %val
@@ -131,25 +131,25 @@ define <2 x float> @v_constained_fsub_v2f32_fpexcept_maytrap(<2 x float> %x, <2
 define <3 x float> @v_constained_fsub_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v3f32_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e32 v0, v0, v3
 ; GCN-NEXT:    v_sub_f32_e32 v1, v1, v4
 ; GCN-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v3f32_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_sub_f32_e32 v0, v0, v3
 ; GFX10-NEXT:    v_sub_f32_e32 v1, v1, v4
 ; GFX10-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_constained_fsub_v3f32_fpexcept_strict:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_sub_f32 v0, v0, v3 :: v_dual_sub_f32 v1, v1, v4
 ; GFX11-NEXT:    v_sub_f32_e32 v2, v2, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x float> %val
@@ -173,14 +173,14 @@ define amdgpu_ps float @s_constained_fsub_f32_fpexcept_strict(float inreg %x, fl
 define float @v_constained_fsub_f32_fpexcept_strict_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e64 v0, |v0|, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e64 v0, |v0|, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %val = call float @llvm.experimental.constrained.fsub.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -190,14 +190,14 @@ define float @v_constained_fsub_f32_fpexcept_strict_fabs_lhs(float %x, float %y)
 define float @v_constained_fsub_f32_fpexcept_strict_fabs_rhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_rhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e64 v0, v0, |v1|
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_rhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e64 v0, v0, |v1|
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call float @llvm.fabs.f32(float %y) #0
   %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict")
@@ -207,14 +207,14 @@ define float @v_constained_fsub_f32_fpexcept_strict_fabs_rhs(float %x, float %y)
 define float @v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs(float %x, float %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_sub_f32_e64 v0, -|v0|, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_f32_e64 v0, -|v0|, v1
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call float @llvm.fabs.f32(float %x) #0
   %neg.fabs.x = fneg float %fabs.x
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
index e7d136c377079e1..ff01a7c09284803 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll
@@ -9,14 +9,14 @@
 define double @v_constained_fsub_f64_fpexcept_strict(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %val
@@ -25,14 +25,14 @@ define double @v_constained_fsub_f64_fpexcept_strict(double %x, double %y) #0 {
 define double @v_constained_fsub_f64_fpexcept_ignore(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret double %val
@@ -41,14 +41,14 @@ define double @v_constained_fsub_f64_fpexcept_ignore(double %x, double %y) #0 {
 define double @v_constained_fsub_f64_fpexcept_maytrap(double %x, double %y) #0 {
 ; GCN-LABEL: v_constained_fsub_f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret double %val
@@ -57,16 +57,16 @@ define double @v_constained_fsub_f64_fpexcept_maytrap(double %x, double %y) #0 {
 define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <2 x double> %val
@@ -75,16 +75,16 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_strict(<2 x double> %x, <2
 define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f64_fpexcept_ignore:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_ignore:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
   ret <2 x double> %val
@@ -93,16 +93,16 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_ignore(<2 x double> %x, <2
 define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, <2 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v2f64_fpexcept_maytrap:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v2f64_fpexcept_maytrap:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %x, <2 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
   ret <2 x double> %val
@@ -111,18 +111,18 @@ define <2 x double> @v_constained_fsub_v2f64_fpexcept_maytrap(<2 x double> %x, <
 define <3 x double> @v_constained_fsub_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y) #0 {
 ; GCN-LABEL: v_constained_fsub_v3f64_fpexcept_strict:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], -v[6:7]
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], -v[8:9]
 ; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], -v[10:11]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fsub_v3f64_fpexcept_strict:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], -v[6:7]
 ; GFX10-NEXT:    v_add_f64 v[2:3], v[2:3], -v[8:9]
 ; GFX10-NEXT:    v_add_f64 v[4:5], v[4:5], -v[10:11]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(<3 x double> %x, <3 x double> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <3 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll
index caf04b5b6f17e71..3577108420adc7d 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll
@@ -18,56 +18,56 @@
 define half @test_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #0 {
 ; GFX8-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v0, v3, s4, v0
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v0, v3, s4, v0
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_f16_i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_med3_i32 v0, v3, s0, 0x7fff
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v0, v3, v0, v1
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v0, v3, v0, v1
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v3, v0
 ; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.experimental.constrained.ldexp.f16.i32(half %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %result
@@ -81,7 +81,6 @@ define half @test_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #0 {
 define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a, <2 x i32> %b) #0 {
 ; GFX8-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v1, v3, s4, v0
@@ -89,11 +88,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v1, v2, v1
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v3, s4, v0
@@ -102,11 +101,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_med3_i32 v0, v3, s0, 0x7fff
@@ -116,11 +115,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v3, v1
 ; GFX11-SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v3, v3, v0, v1
@@ -128,11 +127,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v3, v2, v3
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v3, v3, v0, v1
@@ -140,11 +139,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v3, v2, v3
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v3, v0
@@ -156,6 +155,7 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x half> %result
@@ -164,7 +164,6 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a,
 define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a, <3 x i32> %b) #0 {
 ; GFX8-SDAG-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v0, v4, s4, v1
@@ -174,11 +173,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX8-SDAG-NEXT:    v_med3_i32 v1, v6, s4, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v0, v4, s4, v1
@@ -189,11 +188,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v6, s4, v1
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v2, v0, s5
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_med3_i32 v0, v4, s0, 0x7fff
@@ -206,11 +205,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX11-SDAG-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v3, v2
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v4, v4, v0, v1
@@ -220,11 +219,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX8-GISEL-NEXT:    v_med3_i32 v0, v6, v0, v1
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v2
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v4, v4, v0, v1
@@ -234,11 +233,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX9-GISEL-NEXT:    v_med3_i32 v0, v6, v0, v1
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v0
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v4
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v4, v0
@@ -252,6 +251,7 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
 ; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v4
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x half> @llvm.experimental.constrained.ldexp.v3f16.v3i32(<3 x half> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <3 x half> %result
@@ -260,7 +260,6 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a,
 define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a, <4 x i32> %b) #0 {
 ; GFX8-SDAG-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX8-SDAG-NEXT:    v_med3_i32 v1, v7, s4, v0
@@ -273,11 +272,11 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX8-SDAG-NEXT:    v_ldexp_f16_e32 v0, v2, v0
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v5
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v6, s4, v0
@@ -291,11 +290,11 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SDAG-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX9-SDAG-NEXT:    v_perm_b32 v1, v3, v1, s4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_med3_i32 v0, v6, s0, 0x7fff
@@ -312,11 +311,11 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SDAG-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX8-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX8-GISEL-NEXT:    v_med3_i32 v4, v4, v0, v1
@@ -329,11 +328,11 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX8-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fff
 ; GFX9-GISEL-NEXT:    v_med3_i32 v4, v4, v0, v1
@@ -346,11 +345,11 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v4
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
@@ -371,6 +370,7 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a,
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_lshl_or_b32 v1, v4, 16, v2
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x half> @llvm.experimental.constrained.ldexp.v4f16.v4i32(<4 x half> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x half> %result
diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll
index 0657bfbc3866419..2fb9364f6bd1d7b 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll
@@ -17,26 +17,26 @@
 define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
 ; GFX6-LABEL: test_ldexp_f32_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_ldexp_f32_e32 v0, v2, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_ldexp_f32_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_ldexp_f32_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_f32_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.ldexp.f32.i32(float %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret float %result
@@ -50,51 +50,51 @@ define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
 define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a, <2 x i32> %b) #0 {
 ; GFX6-SDAG-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v1, v3, v5
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v2, v4
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v1, v3, v5
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v0, v2, v4
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v1, v3, v5
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v2, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v2, v4
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v3, v5
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v2f32_v2i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v2, v4
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v1, v3, v5
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.experimental.constrained.ldexp.v2f32.v2i32(<2 x float> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x float> %result
@@ -103,61 +103,61 @@ define <2 x float> @test_ldexp_v2f32_v2i32(ptr addrspace(1) %out, <2 x float> %a
 define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a, <3 x i32> %b) #0 {
 ; GFX6-SDAG-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v4, v4, v7
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v1, v3, v6
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v2, v5
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v2, v4
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v4, v4, v7
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v4, v4, v7
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX11-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v2, v5
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v3, v6
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v2, v4, v7
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v3f32_v3i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v2, v5
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v1, v3, v6
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v2, v4, v7
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.experimental.constrained.ldexp.v3f32.v3i32(<3 x float> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <3 x float> %result
@@ -166,71 +166,71 @@ define <3 x float> @test_ldexp_v3f32_v3i32(ptr addrspace(1) %out, <3 x float> %a
 define <4 x float> @test_ldexp_v4f32_v4i32(ptr addrspace(1) %out, <4 x float> %a, <4 x i32> %b) #0 {
 ; GFX6-SDAG-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX6-SDAG:       ; %bb.0:
-; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v5, v5, v9
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v4, v4, v8
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v1, v3, v7
 ; GFX6-SDAG-NEXT:    v_ldexp_f32_e32 v0, v2, v6
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX6-SDAG-NEXT:    v_mov_b32_e32 v3, v5
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v5, v5, v9
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v4, v4, v8
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX8-SDAG-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX8-SDAG-NEXT:    v_mov_b32_e32 v3, v5
+; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v5, v5, v9
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v4, v4, v8
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX9-SDAG-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, v4
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX11-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX11-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX11-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX6-GISEL:       ; %bb.0:
-; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v2, v6
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v1, v3, v7
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v2, v4, v8
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v3, v5, v9
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX8-GISEL-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: test_ldexp_v4f32_v4i32:
 ; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v2, v6
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v1, v3, v7
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v2, v4, v8
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v3, v5, v9
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.experimental.constrained.ldexp.v4f32.v4i32(<4 x float> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll
index e334340d8f2ae6a..1de3401cd017124 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f64.ll
@@ -17,8 +17,8 @@
 define double @test_ldexp_f64_i32(ptr addrspace(1) %out, double %a, i32 %b) #0 {
 ; GCN-LABEL: test_ldexp_f64_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[2:3], v4
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.ldexp.f64.i32(double %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret double %result
@@ -32,9 +32,9 @@ define double @test_ldexp_f64_i32(ptr addrspace(1) %out, double %a, i32 %b) #0 {
 define <2 x double> @test_ldexp_v2f64_v2i32(ptr addrspace(1) %out, <2 x double> %a, <2 x i32> %b) #0 {
 ; GCN-LABEL: test_ldexp_v2f64_v2i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[2:3], v6
 ; GCN-NEXT:    v_ldexp_f64 v[2:3], v[4:5], v7
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.experimental.constrained.ldexp.v2f64.v2i32(<2 x double> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <2 x double> %result
@@ -43,10 +43,10 @@ define <2 x double> @test_ldexp_v2f64_v2i32(ptr addrspace(1) %out, <2 x double>
 define <3 x double> @test_ldexp_v3f64_v3i32(ptr addrspace(1) %out, <3 x double> %a, <3 x i32> %b) #0 {
 ; GCN-LABEL: test_ldexp_v3f64_v3i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[2:3], v8
 ; GCN-NEXT:    v_ldexp_f64 v[2:3], v[4:5], v9
 ; GCN-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v10
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.experimental.constrained.ldexp.v3f64.v3i32(<3 x double> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <3 x double> %result
@@ -55,11 +55,11 @@ define <3 x double> @test_ldexp_v3f64_v3i32(ptr addrspace(1) %out, <3 x double>
 define <4 x double> @test_ldexp_v4f64_v4i32(ptr addrspace(1) %out, <4 x double> %a, <4 x i32> %b) #0 {
 ; GCN-LABEL: test_ldexp_v4f64_v4i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_ldexp_f64 v[0:1], v[2:3], v10
 ; GCN-NEXT:    v_ldexp_f64 v[2:3], v[4:5], v11
 ; GCN-NEXT:    v_ldexp_f64 v[4:5], v[6:7], v12
 ; GCN-NEXT:    v_ldexp_f64 v[6:7], v[8:9], v13
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x double> @llvm.experimental.constrained.ldexp.v4f64.v4i32(<4 x double> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret <4 x double> %result
diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
index 4f33e19835172a7..d73402a0e304ed5 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll
@@ -6,7 +6,6 @@
 define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6) unnamed_addr align 2 {
 ; CHECK-LABEL: bar:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v15, v12
 ; CHECK-NEXT:    v_mov_b32_e32 v14, v11
 ; CHECK-NEXT:    v_mov_b32_e32 v13, v10
@@ -52,6 +51,7 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a
 ; CHECK-NEXT:    s_addc_u32 s17, s17, quux at rel32@hi+12
 ; CHECK-NEXT:    s_setpc_b64 s[16:17]
 ; CHECK-NEXT:  .LBB0_5: ; %bb9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   switch i32 undef, label %bb9 [
diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
index 88c1fd9f66b33dd..5f6ba4e3917ad7d 100644
--- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll
@@ -7,8 +7,8 @@
 define hidden amdgpu_gfx float @callee(float %v.arg0, float %v.arg1) {
 ; GCN-LABEL: callee:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %v.arg0, %v.arg1
   ret float %add
@@ -17,7 +17,6 @@ define hidden amdgpu_gfx float @callee(float %v.arg0, float %v.arg1) {
 define amdgpu_gfx float @caller(float %arg0) {
 ; GCN-SELDAG-LABEL: caller:
 ; GCN-SELDAG:       ; %bb.0:
-; GCN-SELDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-SELDAG-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-SELDAG-NEXT:    s_mov_b32 s37, callee at abs32@hi
 ; GCN-SELDAG-NEXT:    s_mov_b32 s36, callee at abs32@lo
@@ -26,7 +25,6 @@ define amdgpu_gfx float @caller(float %arg0) {
 ;
 ; GCN-GISEL-LABEL: caller:
 ; GCN-GISEL:       ; %bb.0:
-; GCN-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-GISEL-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GCN-GISEL-NEXT:    s_mov_b32 s36, callee at abs32@lo
 ; GCN-GISEL-NEXT:    s_mov_b32 s37, callee at abs32@hi
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index d200b25c17d33bd..0f170233d3a6d49 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -6,8 +6,8 @@
 define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
 ; GCN-LABEL: trunc_bitcast_i64_lshr_32_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %srl = lshr i64 %bar, 32
   %trunc = trunc i64 %srl to i16
@@ -17,8 +17,8 @@ define i16 @trunc_bitcast_i64_lshr_32_i16(i64 %bar) {
 define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
 ; GCN-LABEL: trunc_bitcast_i64_lshr_32_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %srl = lshr i64 %bar, 32
   %trunc = trunc i64 %srl to i32
@@ -28,20 +28,20 @@ define i32 @trunc_bitcast_i64_lshr_32_i32(i64 %bar) {
 define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
 ; SI-LABEL: trunc_bitcast_v2i32_to_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; SI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: trunc_bitcast_v2i32_to_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v0, 4, v0
+; VI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load i32, ptr addrspace(1) undef
   %load1 = load i32, ptr addrspace(1) null
@@ -57,20 +57,20 @@ define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
 define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
 ; SI-LABEL: trunc_bitcast_v2f32_to_i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; SI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: trunc_bitcast_v2f32_to_i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u16_e32 v0, 4, v0
+; VI-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load float, ptr addrspace(1) undef
   %load1 = load float, ptr addrspace(1) null
@@ -140,18 +140,18 @@ bb:
 define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
 ; SI-LABEL: trunc_v2i64_arg_to_v2i16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: trunc_v2i64_arg_to_v2i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x1000504
 ; VI-NEXT:    v_perm_b32 v0, v0, v2, s4
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc <2 x i64> %arg0 to <2 x i16>
   ret <2 x i16> %trunc
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 05de0bc5f282ade..40be996623be721 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -7,9 +7,9 @@ declare void @wobble()
 define internal fastcc void @widget() {
 ; GFX90A-LABEL: widget:
 ; GFX90A:       ; %bb.0: ; %bb
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s16, s33
 ; GFX90A-NEXT:    s_mov_b32 s33, s32
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index b714fda6f1d0764..eebb7cf30d153c0 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -7,34 +7,34 @@
 define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_min_u32_e32 v0, 0xff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_min_u16_e32 v0, 0xff, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_min_u16_e32 v0, 0xff, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_i8:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-NEXT:    v_min_u16 v0, 0xff, v0
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -43,29 +43,29 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -74,28 +74,28 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
 define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v2, v1
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -104,7 +104,6 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
 define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -115,26 +114,27 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -143,7 +143,6 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
@@ -160,29 +159,30 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
 ; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v2 clamp
 ; GFX8-NEXT:    v_add_u16_e64 v1, v1, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v3i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
   ret <3 x i16> %result
@@ -191,7 +191,6 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -212,31 +211,32 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v0, v0, v2 clamp
 ; GFX8-NEXT:    v_add_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_add_u16_e64 v1, v1, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v4i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
 ; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -246,34 +246,34 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v4, v2
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_not_b32_e32 v2, v3
 ; GFX6-NEXT:    v_min_u32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v3 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v2i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v2 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v3 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -282,7 +282,6 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v6, v3
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v6
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
@@ -292,30 +291,31 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_min_u32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v3 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v4 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v5 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v3i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v3 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v4 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v5 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -324,7 +324,6 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v8, v4
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
@@ -337,33 +336,34 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v4 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v5 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v6 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v7 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v4i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v4 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v5 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v6 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v7 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -372,7 +372,6 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v8i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v16, v8
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v16
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
@@ -397,11 +396,11 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v8i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v8 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v9 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v10 clamp
@@ -410,11 +409,11 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v5, s[4:5], v5, v13 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v6, s[4:5], v6, v14 clamp
 ; GFX8-NEXT:    v_add_u32_e64 v7, s[4:5], v7, v15 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v8i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v8 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v9 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v2, v2, v10 clamp
@@ -423,11 +422,11 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX9-NEXT:    v_add_u32_e64 v5, v5, v13 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v6, v6, v14 clamp
 ; GFX9-NEXT:    v_add_u32_e64 v7, v7, v15 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v8i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v8 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v9 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v10 clamp
@@ -436,6 +435,7 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX10-NEXT:    v_add_nc_u32_e64 v5, v5, v13 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v6, v6, v14 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v7, v7, v15 clamp
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
   ret <8 x i32> %result
@@ -444,7 +444,6 @@ define <8 x i32> @v_uaddsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_uaddsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_not_b32_e32 v31, v16
 ; GFX6-NEXT:    v_min_u32_e32 v0, v0, v31
 ; GFX6-NEXT:    buffer_load_dword v31, off, s[0:3], s32
@@ -495,11 +494,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_not_b32_e32 v16, v31
 ; GFX6-NEXT:    v_min_u32_e32 v15, v15, v16
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v31
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v16 clamp
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v17 clamp
@@ -518,11 +517,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30 clamp
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e64 v15, s[4:5], v15, v16 clamp
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_add_u32_e64 v1, v1, v17 clamp
@@ -541,11 +540,11 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_add_u32_e64 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e64 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v17 clamp
@@ -564,6 +563,7 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_add_nc_u32_e64 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e64 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -573,42 +573,42 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_uaddsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_uaddsat_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc_lo
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 10f45b2c8aba01f..79fa9d1432a7d77 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2519,7 +2519,6 @@ bb:
 define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-LABEL: v_test_udiv64_mulhi_fold:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, 0x346d900
 ; SI-NEXT:    s_add_u32 s4, 0x4237, s4
 ; SI-NEXT:    v_mov_b32_e32 v2, 0xa9000000
@@ -2593,11 +2592,11 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_test_udiv64_mulhi_fold:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, 0x346d900
 ; VI-NEXT:    s_add_u32 s4, 0x4237, s4
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xa9000000
@@ -2662,11 +2661,11 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-LABEL: v_test_udiv64_mulhi_fold:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s4, 0x346d900
 ; GCN-NEXT:    s_add_u32 s4, 0x4237, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0xa9000000
@@ -2731,11 +2730,11 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1030-LABEL: v_test_udiv64_mulhi_fold:
 ; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT:    s_mov_b32 s4, 0x346d900
 ; GFX1030-NEXT:    s_add_u32 s4, 0x4237, s4
 ; GFX1030-NEXT:    s_addc_u32 s5, 0, 0
@@ -2800,6 +2799,7 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) {
 ; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc_lo
 ; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc_lo
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1030-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: v_test_udiv64_mulhi_fold:
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 6102d652402a210..92c7ae24f4a1b16 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -206,7 +206,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_udiv_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
@@ -311,11 +310,11 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v9, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
@@ -336,6 +335,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -396,6 +396,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v5
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v4
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, %y
   ret i64 %result
@@ -461,7 +462,6 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %
 define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_udiv24_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
@@ -475,11 +475,11 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv24_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
@@ -493,6 +493,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
@@ -1062,7 +1063,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
@@ -1152,11 +1152,11 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v9, v7, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -1172,6 +1172,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1231,6 +1232,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 32768, %x
   ret i64 %result
@@ -1239,14 +1241,13 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 15
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 15, v1
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -1261,6 +1262,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB10_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1317,6 +1319,7 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 32768
   ret i64 %result
@@ -1475,7 +1478,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, 0, 0xaaaa0000
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
 ; GCN-NEXT:    v_mul_hi_u32 v2, s4, v2
@@ -1543,11 +1545,11 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v8, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -1562,6 +1564,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v2, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB12_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1617,6 +1620,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 24
   ret i64 %result
@@ -1723,7 +1727,6 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x
 define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv24_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
@@ -1736,11 +1739,11 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv24_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
@@ -1753,6 +1756,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 24, %x.shr
@@ -1762,7 +1766,6 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1775,11 +1778,11 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv24_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1792,6 +1795,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 32768, %x.shr
@@ -1801,14 +1805,13 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_udiv24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 23, v1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_udiv24_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1820,6 +1823,7 @@ define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 %x.shr, 32768
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index ba52d702c7ed112..bd2f8591c1fbd43 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -381,17 +381,17 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 define double @v_uint_to_fp_i8_to_f64(i8 %in) {
 ; SI-LABEL: v_uint_to_fp_i8_to_f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_uint_to_fp_i8_to_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %fp = uitofp i8 %in to double
   ret double %fp
@@ -434,13 +434,12 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 1.0, double 0.0
@@ -485,13 +484,12 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
@@ -537,13 +535,12 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
-; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index 0873638d335bbe7..512d68afcd63551 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1137,7 +1137,6 @@ done:
 define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
 ; SI-LABEL: move_to_valu_vgpr_operand_phi:
 ; SI:       ; %bb.0: ; %bb0
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 28, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 1
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
@@ -1152,15 +1151,15 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
 ; SI-NEXT:  ; %bb.3: ; %bb2
 ; SI-NEXT:    ; in Loop: Header=BB20_2 Depth=1
 ; SI-NEXT:    ds_write_b32 v0, v1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
 ; SI-NEXT:    s_cbranch_vccz .LBB20_1
 ; SI-NEXT:  ; %bb.4: ; %DummyReturnBlock
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: move_to_valu_vgpr_operand_phi:
 ; VI:       ; %bb.0: ; %bb0
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 28, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, 1
 ; VI-NEXT:    s_and_b64 vcc, exec, 0
@@ -1178,7 +1177,7 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
 ; VI-NEXT:    s_mov_b64 vcc, vcc
 ; VI-NEXT:    s_cbranch_vccz .LBB20_1
 ; VI-NEXT:  ; %bb.4: ; %DummyReturnBlock
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 bb0:
   br label %bb1
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index 86e2822a3e5b164..4280bdfe9602986 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -5,9 +5,9 @@
 define hidden void @widget() {
 ; GCN-LABEL: widget:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
@@ -37,8 +37,11 @@ define hidden void @widget() {
 ; GCN-NEXT:    s_mov_b64 s[16:17], 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
+; GCN-NEXT:    s_and_b64 vcc, exec, vcc
 ; GCN-NEXT:    s_mov_b64 s[46:47], 0
 ; GCN-NEXT:    s_mov_b64 s[18:19], 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-NEXT:    s_cbranch_vccz .LBB0_9
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
@@ -253,9 +256,9 @@ define hidden void @blam() {
 ;
 ; GCN-LABEL: blam:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
index a0dd0e7e78f9db8..639bd1e96b05f79 100644
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -4,13 +4,13 @@
 define i1 @test_urem_odd(i13 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
 ; CHECK-NEXT:    s_movk_i32 s4, 0x667
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0xccd, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1fff, v0
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i13 %X, 5
   %cmp = icmp eq i13 %urem, 0
@@ -20,7 +20,6 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_mov_b32 s4, 0x6db6db7
 ; CHECK-NEXT:    s_mov_b32 s5, 0x924925
 ; CHECK-NEXT:    v_mul_lo_u32 v0, v0, s4
@@ -30,6 +29,7 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ffffff, v0
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i27 %X, 14
   %cmp = icmp eq i27 %urem, 0
@@ -39,12 +39,12 @@ define i1 @test_urem_even(i27 %X) nounwind {
 define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; CHECK-LABEL: test_urem_odd_setne:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 13, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 15, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i4 %X, 5
   %cmp = icmp ne i4 %urem, 0
@@ -54,12 +54,12 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT:    v_mul_u32_u24_e32 v0, 0x133, v0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, 1, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem i9 %X, -5
   %cmp = icmp ne i9 %urem, 0
@@ -69,7 +69,6 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-LABEL: test_urem_vec:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x7ff, v0
 ; CHECK-NEXT:    v_and_b32_e32 v1, 0x7ff, v1
 ; CHECK-NEXT:    v_and_b32_e32 v2, 0x7ff, v2
@@ -91,6 +90,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
   %cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 7e8661b3ff32dd1..d9d01a8bf0db8cf 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -216,7 +216,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-LABEL: v_test_urem_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v5, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
@@ -320,11 +319,11 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v4, v2
 ; GCN-IR-NEXT:    v_add_i32_e64 v4, s[6:7], 32, v4
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v5, v3
@@ -345,6 +344,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB1_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -411,6 +411,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %x, %y
   ret i64 %result
@@ -1056,7 +1057,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_urem_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v0
@@ -1145,11 +1145,11 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -1165,6 +1165,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB8_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1230,6 +1231,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 32768, %x
   ret i64 %result
@@ -1238,14 +1240,13 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_urem_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GCN-IR-NEXT:    v_add_i32_e64 v2, s[4:5], 32, v2
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -1260,6 +1261,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[4:5]
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[4:5]
 ; GCN-IR-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GCN-IR-NEXT:    s_waitcnt expcnt(0)
 ; GCN-IR-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_cbranch_execz .LBB9_6
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1317,6 +1319,7 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_lshl_b64 v[2:3], v[4:5], 15
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %x, 32768
   ret i64 %result
@@ -1433,7 +1436,6 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x
 define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_urem24_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x41c00000
@@ -1448,11 +1450,11 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem24_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x41c00000
@@ -1467,6 +1469,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 24, %x.shr
@@ -1476,7 +1479,6 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-LABEL: v_test_urem24_pow2_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1491,11 +1493,11 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem24_pow2_k_num_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1510,6 +1512,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 32768, %x.shr
@@ -1519,14 +1522,13 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) {
 ; GCN-LABEL: v_test_urem24_pow2_k_den_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_bfe_u32 v0, v1, 8, 15
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-IR-LABEL: v_test_urem24_pow2_k_den_i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, 0x47000000
@@ -1540,6 +1542,7 @@ define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 %x.shr, 32768
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 8cc7025d671c471..86e574283916cd1 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -8,31 +8,31 @@
 define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX6-LABEL: v_usubsat_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result
@@ -41,29 +41,29 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
 define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
 ; GFX6-LABEL: v_usubsat_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
   ret i16 %result
@@ -72,31 +72,31 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
 define i16 @usubsat_as_bithack_i16(i16 %x) {
 ; GFX6-LABEL: usubsat_as_bithack_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: usubsat_as_bithack_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: usubsat_as_bithack_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = xor i16 %x, 32768
@@ -107,31 +107,31 @@ define i16 @usubsat_as_bithack_i16(i16 %x) {
 define i16 @usubsat_as_bithack2_i16(i16 %x) {
 ; GFX6-LABEL: usubsat_as_bithack2_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, v1, v0
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: usubsat_as_bithack2_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack2_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: usubsat_as_bithack2_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = add i16 %x, 32768
@@ -142,31 +142,31 @@ define i16 @usubsat_as_bithack2_i16(i16 %x) {
 define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
 ; GFX6-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 15, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0xffff8000, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-NEXT:    v_sub_u16_e64 v0, v0, s4 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u16 v0, v0, 0x8000 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %signsplat = ashr i16 %x, 15
   %flipsign = add i16 %x, 32768
@@ -177,27 +177,27 @@ define i16 @usubsat_as_bithack_commute_i16(i16 %x) {
 define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-LABEL: v_usubsat_i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v1 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs)
   ret i32 %result
@@ -206,7 +206,6 @@ define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) {
 define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
@@ -217,26 +216,27 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v1 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v2i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
   ret <2 x i16> %result
@@ -245,7 +245,6 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
@@ -261,29 +260,30 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_u32_e32 v1, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v5
 ; GFX6-NEXT:    v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
 ; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v3i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v3i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
   ret <3 x i16> %result
@@ -292,7 +292,6 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
 define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-LABEL: v_usubsat_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v5
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -313,31 +312,32 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u16_sdwa v4, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v0, v0, v2 clamp
 ; GFX8-NEXT:    v_sub_u16_sdwa v2, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_sub_u16_e64 v1, v1, v3 clamp
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v4i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v4i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_pk_sub_u16 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
   %cast = bitcast <4 x i16> %result to <2 x float>
@@ -347,32 +347,32 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v2i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v2 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v3 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v2i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v3 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v2i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v2 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v3 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
   ret <2 x i32> %result
@@ -381,37 +381,37 @@ define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v3i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_max_u32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v3i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v3 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v4 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v5 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v3i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v3 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v4 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v5 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v3i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v3 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v5 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
   ret <3 x i32> %result
@@ -420,7 +420,6 @@ define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v4i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_max_u32_e32 v2, v2, v6
@@ -429,33 +428,34 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v4 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v5 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v6 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v3, v7 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v4i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v4 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v5 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v6 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v3, v3, v7 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v4i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v4 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v5 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v6 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v3, v3, v7 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
   ret <4 x i32> %result
@@ -464,7 +464,6 @@ define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v8i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v8
 ; GFX6-NEXT:    v_max_u32_e32 v1, v1, v9
 ; GFX6-NEXT:    v_max_u32_e32 v2, v2, v10
@@ -481,11 +480,11 @@ define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v14
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v15
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v8i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v8 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v9 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v2, v10 clamp
@@ -494,11 +493,11 @@ define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v5, s[4:5], v5, v13 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v6, s[4:5], v6, v14 clamp
 ; GFX8-NEXT:    v_sub_u32_e64 v7, s[4:5], v7, v15 clamp
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v8i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v8 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v9 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v2, v2, v10 clamp
@@ -507,11 +506,11 @@ define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e64 v5, v5, v13 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v6, v6, v14 clamp
 ; GFX9-NEXT:    v_sub_u32_e64 v7, v7, v15 clamp
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_v8i32:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v0, v0, v8 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v1, v1, v9 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v2, v2, v10 clamp
@@ -520,6 +519,7 @@ define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v5, v5, v13 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v6, v6, v14 clamp
 ; GFX10PLUS-NEXT:    v_sub_nc_u32_e64 v7, v7, v15 clamp
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs)
   ret <8 x i32> %result
@@ -528,7 +528,6 @@ define <8 x i32> @v_usubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_usubsat_v16i32:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v0, v0, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
@@ -563,11 +562,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_max_u32_e32 v15, v15, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
+; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v16i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v0, s[4:5], v0, v16 clamp
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v1, v17 clamp
@@ -586,11 +585,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v14, s[4:5], v14, v30 clamp
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e64 v15, s[4:5], v15, v16 clamp
+; GFX8-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_v16i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v0, v0, v16 clamp
 ; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32
 ; GFX9-NEXT:    v_sub_u32_e64 v1, v1, v17 clamp
@@ -609,11 +608,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX9-NEXT:    v_sub_u32_e64 v14, v14, v30 clamp
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_u32_e64 v15, v15, v16 clamp
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_usubsat_v16i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
@@ -632,11 +631,11 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_usubsat_v16i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v0, v0, v16 clamp
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v1, v1, v17 clamp
@@ -655,6 +654,7 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v14, v14, v30 clamp
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_sub_nc_u32_e64 v15, v15, v31 clamp
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
   ret <16 x i32> %result
@@ -664,42 +664,42 @@ define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-LABEL: v_usubsat_i64:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v2
 ; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_usubsat_i64:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_usubsat_i64:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, v2
 ; GFX10PLUS-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
 ; GFX10PLUS-NEXT:    v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
 ; GFX10PLUS-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index e46992ccbbc57d4..ed7842647eef3bc 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -18,63 +18,63 @@ declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
 define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
 ; SDAG-VI-LABEL: basic_smax_smin:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; SDAG-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: basic_smax_smin:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; SDAG-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; SDAG-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; SDAG-GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; SDAG-GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-LABEL: basic_smax_smin:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; SDAG-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: basic_smax_smin:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; GISEL-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: basic_smax_smin:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; GISEL-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; GISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX11-LABEL: basic_smax_smin:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; GISEL-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
   %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
   %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
@@ -211,63 +211,63 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
 ; SDAG-VI-LABEL: basic_smin_smax:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v2, 0
 ; SDAG-VI-NEXT:    v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: basic_smin_smax:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; SDAG-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; SDAG-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; SDAG-GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; SDAG-GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-LABEL: basic_smin_smax:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; SDAG-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: basic_smin_smax:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; GISEL-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; GISEL-VI-NEXT:    v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: basic_smin_smax:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; GISEL-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; GISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX11-LABEL: basic_smin_smax:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; GISEL-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
   %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
   %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
@@ -281,63 +281,63 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
 define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
 ; SDAG-VI-LABEL: basic_smin_smax_combined:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; SDAG-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: basic_smin_smax_combined:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; SDAG-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; SDAG-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; SDAG-GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; SDAG-GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-LABEL: basic_smin_smax_combined:
 ; SDAG-GFX11:       ; %bb.0:
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; SDAG-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; SDAG-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: basic_smin_smax_combined:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; GISEL-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: basic_smin_smax_combined:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-GFX9-NEXT:    v_med3_i16 v0, v0, 0, v2
 ; GISEL-GFX9-NEXT:    v_med3_i16 v1, v1, 0, v2
 ; GISEL-GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX11-LABEL: basic_smin_smax_combined:
 ; GISEL-GFX11:       ; %bb.0:
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    v_med3_i16 v0, v0, 0, 0xff
 ; GISEL-GFX11-NEXT:    v_med3_i16 v1, v1, 0, 0xff
 ; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GISEL-GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
   %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
   %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
@@ -351,7 +351,6 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
 define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
 ; SDAG-VI-LABEL: vec_smax_smin:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-VI-NEXT:    v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_max_i16_e32 v0, 0, v0
@@ -359,27 +358,27 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
 ; SDAG-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: vec_smax_smin:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    v_pk_max_i16 v0, v0, 0
 ; SDAG-GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; SDAG-GFX9-NEXT:    v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_smax_smin:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_max_i16 v0, v0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: vec_smax_smin:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v0
 ; GISEL-VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -387,14 +386,15 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
 ; GISEL-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: vec_smax_smin:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_pk_max_i16 v0, v0, 0
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff00ff
 ; GISEL-GFX9-NEXT:    v_pk_min_i16 v0, v0, v1
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
   %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> <i16 0, i16 0>)
   %src.clamp = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src.max, <2 x i16> <i16 255, i16 255>)
@@ -530,7 +530,6 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 define <2 x i16> @vec_smin_smax(<2 x i16> %src) {
 ; SDAG-VI-LABEL: vec_smin_smax:
 ; SDAG-VI:       ; %bb.0:
-; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; SDAG-VI-NEXT:    v_min_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
@@ -538,27 +537,27 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) {
 ; SDAG-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; SDAG-VI-NEXT:    v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9-LABEL: vec_smin_smax:
 ; SDAG-GFX9:       ; %bb.0:
-; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; SDAG-GFX9-NEXT:    v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
 ; SDAG-GFX9-NEXT:    v_pk_max_i16 v0, v0, 0
+; SDAG-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: vec_smin_smax:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_max_i16 v0, v0, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: vec_smin_smax:
 ; GISEL-VI:       ; %bb.0:
-; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GISEL-VI-NEXT:    v_min_i16_e32 v2, 0xff, v0
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -566,14 +565,15 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) {
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0
 ; GISEL-VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX9-LABEL: vec_smin_smax:
 ; GISEL-GFX9:       ; %bb.0:
-; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    v_mov_b32_e32 v1, 0xff00ff
 ; GISEL-GFX9-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GISEL-GFX9-NEXT:    v_pk_max_i16 v0, v0, 0
+; GISEL-GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX9-NEXT:    s_setpc_b64 s[30:31]
   %src.min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %src, <2 x i16> <i16 255, i16 255>)
   %src.clamp = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src.min, <2 x i16> <i16 0, i16 0>)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 91cff9d1a5419d1..99c8f057e67839e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -6,23 +6,20 @@
 define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_23uu:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_23uu:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_23uu:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -33,32 +30,31 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_234u:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_234u:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -69,23 +65,20 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u1u3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u1u3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_u1u3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -96,26 +89,26 @@ define <4 x half> @shuffle_v4f16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u3u1:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3u1:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_u3u1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -126,23 +119,20 @@ define <4 x half> @shuffle_v4f16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_u3uu:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_u3uu:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_u3uu:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -153,34 +143,33 @@ define <4 x half> @shuffle_v4f16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3u6u:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3u6u:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3u6u:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -191,34 +180,33 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3uu7:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3uu7:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3uu7:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -229,32 +217,32 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_35u5:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v5, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_35u5:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v5, 0x7060302
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_35u5:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -265,7 +253,6 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_357u:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -273,28 +260,29 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v4, v6, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_357u:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -305,26 +293,26 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0101:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0101:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0101:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -335,23 +323,20 @@ define <4 x half> @shuffle_v4f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0123:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0123:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0123:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -362,32 +347,31 @@ define <4 x half> @shuffle_v4f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0145:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0145:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0145:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -398,32 +382,31 @@ define <4 x half> @shuffle_v4f16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0167:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0167:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0167:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -434,26 +417,26 @@ define <4 x half> @shuffle_v4f16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2301:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2301:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2301:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -464,26 +447,26 @@ define <4 x half> @shuffle_v4f16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2323:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2323:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2323:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -494,32 +477,31 @@ define <4 x half> @shuffle_v4f16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2345:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2345:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2345:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -530,32 +512,31 @@ define <4 x half> @shuffle_v4f16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2367:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2367:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2367:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -566,34 +547,33 @@ define <4 x half> @shuffle_v4f16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4501:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_4501:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_4501:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -604,34 +584,33 @@ define <4 x half> @shuffle_v4f16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4523:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_4523:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_4523:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -642,26 +621,26 @@ define <4 x half> @shuffle_v4f16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4545:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_4545:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_4545:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -672,23 +651,20 @@ define <4 x half> @shuffle_v4f16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_4567:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_4567:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_4567:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -699,34 +675,33 @@ define <4 x half> @shuffle_v4f16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6701:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6701:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6701:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -737,34 +712,33 @@ define <4 x half> @shuffle_v4f16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6723:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6723:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6723:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -775,26 +749,26 @@ define <4 x half> @shuffle_v4f16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6745:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6745:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6745:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -805,26 +779,26 @@ define <4 x half> @shuffle_v4f16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6767:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6767:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6767:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -835,34 +809,33 @@ define <4 x half> @shuffle_v4f16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2356:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2356:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2356:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -873,34 +846,33 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5623:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5623:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_5623:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -911,35 +883,35 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3456:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3456:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -950,35 +922,35 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5634:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_5634:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -989,7 +961,6 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_5734:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
@@ -997,28 +968,29 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5734:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_5734:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1029,34 +1001,33 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4i16_2356:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_2356:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i16_2356:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = load <4 x i16>, ptr addrspace(1) %arg1
@@ -1067,32 +1038,31 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
 define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4i16_0167:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_0167:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i16_0167:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = load <4 x i16>, ptr addrspace(1) %arg1
@@ -1103,31 +1073,31 @@ define <4 x i16> @shuffle_v4i16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %a
 define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0000:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0000:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0000:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1138,30 +1108,30 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_1010:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_1010:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_1010:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1172,31 +1142,31 @@ define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_1100:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_mov_b32 s5, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_1100:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[1:2], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_1100:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1207,34 +1177,34 @@ define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6161:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v5, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6161:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6161:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1245,27 +1215,27 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_2333:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_2333:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_2333:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1276,27 +1246,27 @@ define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_6667:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_6667:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_6667:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1307,26 +1277,26 @@ define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_0101:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_0101:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_0101:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -1337,23 +1307,20 @@ define <4 x half> @shuffle_v8f16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_0123:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_0123:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_0123:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -1364,32 +1331,31 @@ define <4 x half> @shuffle_v8f16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_4589:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:8
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_4589:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:8
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_4589:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -1400,34 +1366,33 @@ define <4 x half> @shuffle_v8f16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX9-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off offset:4
 ; GFX10-NEXT:    global_load_dword v5, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_10_11_2_3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v[2:3], off offset:4
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -1438,34 +1403,33 @@ define <4 x half> @shuffle_v8f16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace
 define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_13_14_2_3:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off offset:8
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v6, v5, 16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
 ; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -1476,27 +1440,27 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace
 define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v3f16_0122:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v1, v1, v1, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f16_0122:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f16_0122:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <3 x half>, ptr addrspace(1) %arg0
   %val1 = load <3 x half>, ptr addrspace(1) %arg1
@@ -1507,26 +1471,26 @@ define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v2f16_0122:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f16_0122:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f16_0122:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x half>, ptr addrspace(1) %arg0
   %val1 = load <2 x half>, ptr addrspace(1) %arg1
@@ -1537,7 +1501,6 @@ define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %
 define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v6f16_452367:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v3
@@ -1548,11 +1511,11 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v7
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f16_452367:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v3
@@ -1563,11 +1526,11 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v7
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f16_452367:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v3, v[3:4], off
@@ -1575,6 +1538,7 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v3
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <6 x half>, ptr addrspace(1) %arg0
   %val1 = load <6 x half>, ptr addrspace(1) %arg1
@@ -1672,7 +1636,6 @@ entry:
 define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_0456:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
@@ -1681,11 +1644,11 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
 ; GFX9-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0456:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[2:3], off
 ; GFX10-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
@@ -1693,16 +1656,17 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0456:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1765,30 +1729,30 @@ attributes #0 = { nounwind readnone speculatable }
 define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: low16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: low16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: low16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1801,30 +1765,30 @@ entry:
 define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1837,30 +1801,30 @@ entry:
 define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: low16hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: low16hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: low16hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1873,29 +1837,29 @@ entry:
 define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: hi16low16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: hi16low16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: hi16low16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -1908,30 +1872,30 @@ entry:
 define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_low16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_low16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_low16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -1944,30 +1908,30 @@ entry:
 define <2 x i16> @i16_low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_low16hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v4, v5
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_low16hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v4, v5
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_low16hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -1980,29 +1944,29 @@ entry:
 define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_hi16low16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16low16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16low16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -2015,30 +1979,30 @@ entry:
 define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX9-LABEL: i16_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_perm_b32 v0, v5, v4, s4
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: i16_hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v5, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: i16_hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -2051,26 +2015,26 @@ entry:
 define <2 x i16> @v2i16_hi16bits(ptr addrspace(1) %x0) {
 ; GFX9-LABEL: v2i16_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v2i16_hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2i16_hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
@@ -2083,26 +2047,26 @@ entry:
 define <2 x half> @v2half_hi16bits(ptr addrspace(1) %x0) {
 ; GFX9-LABEL: v2half_hi16bits:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v2half_hi16bits:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v2half_hi16bits:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load0 = load <2 x half>, ptr addrspace(1) %x0, align 4
@@ -2115,30 +2079,29 @@ entry:
 define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v8f16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -2150,36 +2113,35 @@ define void @shuffle_v8f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v16f16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
@@ -2191,7 +2153,6 @@ define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v32f16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
@@ -2204,12 +2165,11 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
@@ -2224,11 +2184,11 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
@@ -2243,6 +2203,7 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <16 x half>, ptr addrspace(1) %arg0
   %val1 = load <16 x half>, ptr addrspace(1) %arg1
@@ -2254,30 +2215,29 @@ define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v8i16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = load <4 x i16>, ptr addrspace(1) %arg1
@@ -2289,36 +2249,35 @@ define void @shuffle_v8i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v16i16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x i16>, ptr addrspace(1) %arg0
   %val1 = load <8 x i16>, ptr addrspace(1) %arg1
@@ -2330,7 +2289,6 @@ define void @shuffle_v16i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v32i16_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
@@ -2343,12 +2301,11 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i16_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
@@ -2363,11 +2320,11 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i16_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
@@ -2382,6 +2339,7 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <16 x i16>, ptr addrspace(1) %arg0
   %val1 = load <16 x i16>, ptr addrspace(1) %arg1
@@ -2393,31 +2351,30 @@ define void @shuffle_v32i16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v4i8_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    global_load_short_d16_hi v0, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i8_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    global_load_short_d16_hi v0, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i8_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v[4:5], v0, off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = load <2 x i8>, ptr addrspace(1) %arg1
@@ -2429,30 +2386,29 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
 define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v8i8_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX9-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i8_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX10-NEXT:    global_load_dword v7, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx2 v[4:5], v[6:7], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i8_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v[4:5], v[0:1], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i8>, ptr addrspace(1) %arg0
   %val1 = load <4 x i8>, ptr addrspace(1) %arg1
@@ -2464,30 +2420,29 @@ define void @shuffle_v8i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
 define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v16i8_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i8_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i8_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x i8>, ptr addrspace(1) %arg0
   %val1 = load <8 x i8>, ptr addrspace(1) %arg1
@@ -2499,36 +2454,35 @@ define void @shuffle_v16i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v32i8_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i8_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i8_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <16 x i8>, ptr addrspace(1) %arg0
   %val1 = load <16 x i8>, ptr addrspace(1) %arg1
@@ -2540,30 +2494,29 @@ define void @shuffle_v32i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v4i32_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i32_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
 ; GFX10-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i32_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
 ; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x i32>, ptr addrspace(1) %arg0
   %val1 = load <2 x i32>, ptr addrspace(1) %arg1
@@ -2575,36 +2528,35 @@ define void @shuffle_v4i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v8i32_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i32_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i32_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i32>, ptr addrspace(1) %arg0
   %val1 = load <4 x i32>, ptr addrspace(1) %arg1
@@ -2616,7 +2568,6 @@ define void @shuffle_v8i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1
 define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
 ; GFX9-LABEL: shuffle_v16i32_concat:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX9-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
 ; GFX9-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off
@@ -2629,12 +2580,11 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i32_concat:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[2:3], off
 ; GFX10-NEXT:    global_load_dwordx4 v[10:13], v[2:3], off offset:16
@@ -2649,11 +2599,11 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[14:17], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i32_concat:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b128 v[6:9], v[2:3], off
 ; GFX11-NEXT:    global_load_b128 v[10:13], v[2:3], off offset:16
@@ -2668,6 +2618,7 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[14:17], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x i32>, ptr addrspace(1) %arg0
   %val1 = load <8 x i32>, ptr addrspace(1) %arg1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
index c7e1586a28fa14e..de00d05eceff9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -5,7 +5,6 @@
 define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 8
 ; GCN-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
 ; GCN-NEXT:  .LBB0_1: ; %bb0
@@ -25,6 +24,7 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3]
 ; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
 ; GCN-NEXT:    s_and_b32 s4, vcc_lo, s4
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_and_saveexec_b32 s4, s4
 ; GCN-NEXT:    buffer_store_dword v0, v0, s[8:11], 0 offen
 ; GCN-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
@@ -40,7 +40,6 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ;
 ; GFX11-LABEL: vgpr_descriptor_waterfall_loop_idom_update:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB0_1: ; %bb0
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
@@ -57,8 +56,9 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(ptr %arg) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3]
 ; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[4:5]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT:    buffer_store_b32 v0, v0, s[4:7], 0 offen
 ; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 92efbe5a7182634..1d0cbf50db8832a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -10,9 +10,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; preserved across the call and should get 8 scratch registers.
 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
@@ -65,9 +65,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -123,9 +123,9 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX11-LABEL: non_preserved_vgpr_tuple8:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
@@ -200,9 +200,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; The upper 3 sub-registers are unused.
 ; GFX9-LABEL: call_preserved_vgpr_tuple8:
 ; GFX9:       ; %bb.0: ; %main_body
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
@@ -249,9 +249,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX10-LABEL: call_preserved_vgpr_tuple8:
 ; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s4, s33
 ; GFX10-NEXT:    s_mov_b32 s33, s32
+; GFX10-NEXT:    s_waitcnt expcnt(0)
 ; GFX10-NEXT:    s_or_saveexec_b32 s5, -1
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -300,9 +300,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX11-LABEL: call_preserved_vgpr_tuple8:
 ; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, s33
 ; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_waitcnt expcnt(0)
 ; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
index fe0afd28a34660a..a381dacb42867ba 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir
@@ -63,7 +63,6 @@ body: |
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_WAITCNT 0
   ; GCN-NEXT:   $agpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.global4, addrspace 1)
   ; GCN-NEXT:   $agpr4_agpr5_agpr6_agpr7 = FLAT_LOAD_DWORDX4 $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %ir.global16, addrspace 1)
   ; GCN-NEXT:   S_WAITCNT 3953
@@ -118,7 +117,6 @@ body: |
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.1(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_WAITCNT 0
   ; GCN-NEXT:   $agpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
@@ -149,7 +147,6 @@ body: |
   ; GCN: bb.0:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   S_WAITCNT 0
   ; GCN-NEXT:   $agpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec, implicit $flat_scr
   ; GCN-NEXT:   S_BRANCH %bb.2
   ; GCN-NEXT: {{  $}}
@@ -341,8 +338,7 @@ name: high_register_collision
 body: |
   bb.0:
     ; GCN-LABEL: name: high_register_collision
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec
+    ; GCN: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec
     ; GCN-NEXT: $vgpr226 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, implicit $exec, implicit $flat_scr
     ; GCN-NEXT: $vgpr4_vgpr5 = V_LSHLREV_B64_e64 4, $vgpr8_vgpr9, implicit $exec
     ; GCN-NEXT: FLAT_STORE_DWORD $vgpr4_vgpr5, $agpr1, 0, 0, implicit $exec, implicit $flat_scr
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
index 1e4e0ea017fc5aa..e318d8bb8f9892e 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-bvh.mir
@@ -6,8 +6,7 @@ name: waitcnt-check-inorder
 body: |
   bb.0:
     ; GCN-LABEL: name: waitcnt-check-inorder
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: S_ENDPGM 0
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
@@ -19,8 +18,7 @@ name: waitcnt-check-vs-vmem
 body: |
   bb.0:
     ; GCN-LABEL: name: waitcnt-check-vs-vmem
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_ENDPGM 0
@@ -33,8 +31,7 @@ name: waitcnt-check-vs-mimg-samp
 body: |
   bb.0:
     ; GCN-LABEL: name: waitcnt-check-vs-mimg-samp
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
+    ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
     ; GCN-NEXT: S_ENDPGM 0
@@ -47,8 +44,7 @@ name: waitcnt-check-vs-vmem-reverse
 body: |
   bb.0:
     ; GCN-LABEL: name: waitcnt-check-vs-vmem-reverse
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: S_ENDPGM 0
@@ -61,8 +57,7 @@ name: waitcnt-check-vs-mimg-samp-reverse
 body: |
   bb.0:
     ; GCN-LABEL: name: waitcnt-check-vs-mimg-samp-reverse
-    ; GCN: S_WAITCNT 0
-    ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    ; GCN: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2 $vgpr16_vgpr17, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
     ; GCN-NEXT: S_WAITCNT 16240
     ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx10 killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
     ; GCN-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
index a9c567e1145be38..bf2c5de4e5698dd 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir
@@ -13,7 +13,6 @@ body:             |
     ; GCN-LABEL: name: waitcnt_kill
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: KILL $vgpr0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -30,7 +29,6 @@ body:             |
     ; GCN-LABEL: name: waitcnt_implicit_def
     ; GCN: liveins: $vgpr0_vgpr1
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: $vgpr0 = IMPLICIT_DEF
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -47,7 +45,6 @@ body:             |
     ; GCN-LABEL: name: waitcnt_eh_label
     ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: EH_LABEL <mcsymbol Ltmp0>, implicit $vgpr0
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
@@ -65,7 +62,6 @@ body:             |
     ; GCN-LABEL: name: waitcnt_cfi
     ; GCN: liveins: $vgpr0_vgpr1, $vgpr2
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: S_WAITCNT 0
     ; GCN-NEXT: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GCN-NEXT: CFI_INSTRUCTION offset $vgpr0_lo16, 16
     $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
index 7afd63dd35ae6bc..01089e85f2f0e85 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir
@@ -23,7 +23,6 @@ body:             |
     ; GFX9-LABEL: name: max-counter-lgkmcnt
     ; GFX9: liveins: $vgpr99
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
     ; GFX9-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
     ; GFX9-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
@@ -42,7 +41,7 @@ body:             |
     ; GFX9-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
     ; GFX9-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     ; GFX9-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    ; GFX9-NEXT: S_WAITCNT 52863
+    ; GFX9-NEXT: S_WAITCNT 49279
     ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
     ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
     ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
@@ -51,7 +50,6 @@ body:             |
     ; GFX10-LABEL: name: max-counter-lgkmcnt
     ; GFX10: liveins: $vgpr99
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
     ; GFX10-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
     ; GFX10-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
@@ -70,19 +68,15 @@ body:             |
     ; GFX10-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
     ; GFX10-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     ; GFX10-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53631
+    ; GFX10-NEXT: S_WAITCNT 49279
     ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53375
     ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 53119
     ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
-    ; GFX10-NEXT: S_WAITCNT 52863
     ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
     ; GFX10-NEXT: S_ENDPGM 0
     ; GFX11-LABEL: name: max-counter-lgkmcnt
     ; GFX11: liveins: $vgpr99
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
     ; GFX11-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec
     ; GFX11-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec
@@ -101,13 +95,10 @@ body:             |
     ; GFX11-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec
     ; GFX11-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec
     ; GFX11-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT 64791
+    ; GFX11-NEXT: S_WAITCNT 64519
     ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT 64775
     ; GFX11-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT 64759
     ; GFX11-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT 64743
     ; GFX11-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec
     ; GFX11-NEXT: S_ENDPGM 0
     $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec
@@ -145,7 +136,6 @@ body:             |
     ; GFX9-LABEL: name: max-counter-vmcnt
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX9-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
     ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec
@@ -222,7 +212,6 @@ body:             |
     ; GFX10-LABEL: name: max-counter-vmcnt
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX10-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
     ; GFX10-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec
@@ -299,7 +288,6 @@ body:             |
     ; GFX11-LABEL: name: max-counter-vmcnt
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec
     ; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec
@@ -457,9 +445,9 @@ body:             |
     ; GFX9-LABEL: name: max-counter-expcnt
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
     ; GFX9-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX9-NEXT: S_WAITCNT 53007
     ; GFX9-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX9-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX9-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
@@ -471,9 +459,9 @@ body:             |
     ; GFX10-LABEL: name: max-counter-expcnt
     ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
     ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX10-NEXT: S_WAITCNT 65295
     ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
@@ -485,9 +473,9 @@ body:             |
     ; GFX11-LABEL: name: max-counter-expcnt
     ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec
     ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
+    ; GFX11-NEXT: S_WAITCNT 65520
     ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
     ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
index 33c09167ef9461d..c68d85df2aec7cd 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir
@@ -11,7 +11,6 @@ body:             |
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10-NEXT: S_BARRIER
@@ -22,9 +21,9 @@ body:             |
     ; GFX11-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
@@ -47,7 +46,6 @@ body:             |
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
     ; GFX10-NEXT: S_BARRIER
@@ -58,9 +56,9 @@ body:             |
     ; GFX11-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
@@ -83,7 +81,6 @@ body:             |
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 112
     ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
@@ -95,9 +92,8 @@ body:             |
     ; GFX11-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
-    ; GFX11-NEXT: S_WAITCNT 112
+    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
@@ -122,7 +118,6 @@ body:             |
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
     ; GFX10-NEXT: S_BARRIER
@@ -133,9 +128,9 @@ body:             |
     ; GFX11-LABEL: name: test_waitcnt_preexisting_vscnt_combined
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: S_BARRIER
     ; GFX11-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX11-NEXT: S_WAITCNT 7
@@ -160,7 +155,6 @@ body:             |
     ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX10-NEXT: S_WAITCNT 0
     ; GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 1
@@ -172,7 +166,6 @@ body:             |
     ; GFX11-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types
     ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec
     ; GFX11-NEXT: S_WAITCNT 0
     ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
index 50d25f075669524..cb406924a0dc52e 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir
@@ -10,7 +10,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_unmodified
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 49279
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
@@ -33,7 +32,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_unmodified
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 3952
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
@@ -58,7 +56,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 112
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
@@ -81,7 +78,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 112
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
@@ -107,7 +103,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_apply_all_counters
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX9-NEXT: $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 0
@@ -132,7 +127,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
@@ -155,7 +149,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt_diff_counters
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: S_WAITCNT 112
     ; GFX9-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
@@ -177,7 +170,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_early_wait
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: S_NOP 0
@@ -203,7 +195,6 @@ body:             |
     ; GFX9-LABEL: name: test_waitcnt_preexisting_ignore_kill
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: S_WAITCNT 3952
     ; GFX9-NEXT: KILL $vgpr0
@@ -232,8 +223,7 @@ name:            test_waitcnt_preexisting_buffer_inv
 body:             |
   bb.0:
     ; GFX9-LABEL: name: test_waitcnt_preexisting_buffer_inv
-    ; GFX9: S_WAITCNT 0
-    ; GFX9-NEXT: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 3952
     ; GFX9-NEXT: BUFFER_INVL2 implicit $exec
     ; GFX9-NEXT: BUFFER_WBINVL1_VOL implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
index ac6cdbf7d563e97..e28c5dab4c2c673 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir
@@ -11,7 +11,6 @@ body: |
     ; GFX9-LABEL: name: buffer_buffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX9-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, implicit $exec
     $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
@@ -28,7 +27,6 @@ body: |
     ; GFX9-LABEL: name: tbuffer_tbuffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, implicit $exec
     ; GFX9-NEXT: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, implicit $exec
     $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, implicit $exec
@@ -46,7 +44,6 @@ body: |
     ; GFX9-LABEL: name: gather_gather
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
     ; GFX9-NEXT: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
     $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
@@ -65,7 +62,6 @@ body: |
     ; GFX9-LABEL: name: nosampler_sampler
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128))
     ; GFX9-NEXT: S_WAITCNT 3952
     ; GFX9-NEXT: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load (s128))
@@ -82,7 +78,6 @@ body: |
     ; GFX9-LABEL: name: global_scratch_buffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD $vgpr0, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
@@ -101,7 +96,6 @@ body: |
     ; GFX9-LABEL: name: flat_buffer
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     ; GFX9-NEXT: S_WAITCNT 49279
     ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
@@ -118,7 +112,6 @@ body: |
     ; GFX9-LABEL: name: buffer_flat
     ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: S_WAITCNT 0
     ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec
     ; GFX9-NEXT: S_WAITCNT 3952
     ; GFX9-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index c3ac778f82e0499..7ab07ed55aa439c 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1147,14 +1147,14 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa
 define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX1032-LABEL: test_mad_i64_i32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    v_mad_i64_i32 v[0:1], s4, v0, v1, v[2:3]
+; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1064-LABEL: test_mad_i64_i32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = sext i32 %arg0 to i64
   %sext1 = sext i32 %arg1 to i64
@@ -1166,14 +1166,14 @@ define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
 ; GFX1032-LABEL: test_mad_u64_u32:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s4, v0, v1, v[2:3]
+; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1064-LABEL: test_mad_u64_u32:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
+; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_setpc_b64 s[30:31]
   %sext0 = zext i32 %arg0 to i64
   %sext1 = zext i32 %arg1 to i64
@@ -2855,9 +2855,9 @@ declare void @external_void_func_void() #1
 define void @callee_no_stack_with_call() #1 {
 ; GFX1032-LABEL: callee_no_stack_with_call:
 ; GFX1032:       ; %bb.0:
-; GFX1032-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s16, s33
 ; GFX1032-NEXT:    s_mov_b32 s33, s32
+; GFX1032-NEXT:    s_waitcnt expcnt(0)
 ; GFX1032-NEXT:    s_or_saveexec_b32 s17, -1
 ; GFX1032-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
@@ -2886,9 +2886,9 @@ define void @callee_no_stack_with_call() #1 {
 ;
 ; GFX1064-LABEL: callee_no_stack_with_call:
 ; GFX1064:       ; %bb.0:
-; GFX1064-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_mov_b32 s16, s33
 ; GFX1064-NEXT:    s_mov_b32 s33, s32
+; GFX1064-NEXT:    s_waitcnt expcnt(0)
 ; GFX1064-NEXT:    s_or_saveexec_b64 s[18:19], -1
 ; GFX1064-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
index cfb1cffdb509648..aca08a47d2c3de7 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll
@@ -9,9 +9,9 @@
 define void @vector_reg_liverange_split() #0 {
 ; GFX90A-LABEL: vector_reg_liverange_split:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    s_mov_b32 s16, s33
 ; GFX90A-NEXT:    s_mov_b32 s33, s32
+; GFX90A-NEXT:    s_waitcnt expcnt(0)
 ; GFX90A-NEXT:    s_xor_saveexec_b64 s[18:19], -1
 ; GFX90A-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 7eabe982ff2bcde..6fbb6117b0819f5 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -15,9 +15,9 @@
 define void @test() #0 {
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s16, s33
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_waitcnt expcnt(0)
 ; GCN-NEXT:    s_xor_saveexec_b64 s[18:19], -1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
@@ -71,9 +71,9 @@ define void @test() #0 {
 ;
 ; GCN-O0-LABEL: test:
 ; GCN-O0:       ; %bb.0:
-; GCN-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-O0-NEXT:    s_mov_b32 s16, s33
 ; GCN-O0-NEXT:    s_mov_b32 s33, s32
+; GCN-O0-NEXT:    s_waitcnt expcnt(0)
 ; GCN-O0-NEXT:    s_xor_saveexec_b64 s[18:19], -1
 ; GCN-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GCN-O0-NEXT:    s_mov_b64 exec, -1
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index ab7d3ca0ab425f9..6d544ec0cf74b72 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -5,7 +5,7 @@
 define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-LABEL: strict_wwm_no_cfg:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -64,12 +64,12 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_no_cfg:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -109,7 +109,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
   %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
@@ -139,7 +139,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
 define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
 ; GFX9-O0-LABEL: strict_wwm_cfg:
 ; GFX9-O0:       ; %bb.0: ; %entry
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
@@ -256,12 +256,12 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_cfg:
 ; GFX9-O3:       ; %bb.0: ; %entry
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
@@ -308,7 +308,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
 ; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
@@ -345,18 +345,18 @@ merge:
 define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
 ; GFX9-O0-LABEL: strict_wwm_called:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_called:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %a, %a
   %mul = mul i32 %add, %a
@@ -367,9 +367,9 @@ define hidden amdgpu_gfx i32 @strict_wwm_called(i32 %a) noinline {
 define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -426,9 +426,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 ;
 ; GFX9-O3-LABEL: strict_wwm_call:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_mov_b32 s38, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -474,7 +474,6 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
 define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-LABEL: strict_wwm_called_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
@@ -535,11 +534,11 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s34, v[1:2]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_called_i64:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
 ; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
@@ -548,6 +547,7 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
 ; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %a, %a
   %mul = mul i64 %add, %a
@@ -558,9 +558,9 @@ define amdgpu_gfx i64 @strict_wwm_called_i64(i64 %a) noinline {
 define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
 ; GFX9-O0-LABEL: strict_wwm_call_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_mov_b32 s48, s33
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
@@ -685,9 +685,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 ;
 ; GFX9-O3-LABEL: strict_wwm_call_i64:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_mov_b32 s40, s33
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -754,7 +754,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
 define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
 ; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
@@ -831,12 +831,12 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
@@ -889,7 +889,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 ; GFX9-O3-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %tmp17 = shl i32 %index, 5
   %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
@@ -920,7 +920,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
 define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, ptr addrspace(5) %ptr, i64 %a, i64 %b, i64 %c, i64 %d, i64 %e) #0 {
 ; GFX9-O0-LABEL: strict_wwm_callee_saves:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O0-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O0-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
@@ -1258,12 +1258,12 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O0-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_callee_saves:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt expcnt(0)
 ; GFX9-O3-NEXT:    s_xor_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
@@ -1370,7 +1370,7 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
 ; GFX9-O3-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %a2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %a, i64 0)
   %b2 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %b, i64 0)
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 743fb3e60a94dd5..5041ad92dc41483 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -306,18 +306,18 @@ merge:
 define hidden i32 @called(i32 %a) noinline {
 ; GFX9-O0-LABEL: called:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: called:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %a, %a
   %mul = mul i32 %add, %a
@@ -478,7 +478,6 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
 define i64 @called_i64(i64 %a) noinline {
 ; GFX9-O0-LABEL: called_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
@@ -539,11 +538,11 @@ define i64 @called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: called_i64:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
 ; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
@@ -552,6 +551,7 @@ define i64 @called_i64(i64 %a) noinline {
 ; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
 ; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %a, %a
   %mul = mul i64 %add, %a
@@ -1174,18 +1174,18 @@ merge:
 define hidden i32 @strict_wwm_called(i32 %a) noinline {
 ; GFX9-O0-LABEL: strict_wwm_called:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
 ; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_called:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i32 %a, %a
   %mul = mul i32 %add, %a
@@ -1346,7 +1346,6 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
 define i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-LABEL: strict_wwm_called_i64:
 ; GFX9-O0:       ; %bb.0:
-; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
@@ -1407,11 +1406,11 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
+; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-O3-LABEL: strict_wwm_called_i64:
 ; GFX9-O3:       ; %bb.0:
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
 ; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
 ; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
@@ -1420,6 +1419,7 @@ define i64 @strict_wwm_called_i64(i64 %a) noinline {
 ; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
 ; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
   %add = add i64 %a, %a
   %mul = mul i64 %add, %a

>From 4f87e3cbfbe8056e948577622d67677ad1c5b183 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Fri, 1 Dec 2023 13:31:22 +0000
Subject: [PATCH 2/2] Simplify upper bound logic

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8fdfa0ac962f708..baee15913f90b3d 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1861,12 +1861,8 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     EntryBI.Incoming = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
     WaitcntBrackets &Brackets = *EntryBI.Incoming;
 
-    unsigned UB = 0;
     for (auto T : inst_counter_types())
-      UB = std::max(UB, Brackets.getWaitCountMax(T));
-
-    for (auto T : inst_counter_types())
-      Brackets.setScoreUB(T, UB);
+      Brackets.setScoreUB(T, Brackets.getWaitCountMax(T));
 
     if (ST->hasVscnt()) {
       Brackets.setPendingEvent(VMEM_READ_ACCESS);



More information about the llvm-commits mailing list